diff --git "a/Random_forest.py/Random_forest_ver3.ipynb" "b/Random_forest.py/Random_forest_ver3.ipynb" new file mode 100644--- /dev/null +++ "b/Random_forest.py/Random_forest_ver3.ipynb" @@ -0,0 +1,3819 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "7ec041c8", + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "execution": { + "iopub.execute_input": "2023-07-24T16:44:11.827497Z", + "iopub.status.busy": "2023-07-24T16:44:11.827091Z", + "iopub.status.idle": "2023-07-24T16:44:11.842480Z", + "shell.execute_reply": "2023-07-24T16:44:11.841069Z" + }, + "papermill": { + "duration": 0.038107, + "end_time": "2023-07-24T16:44:11.844975", + "exception": false, + "start_time": "2023-07-24T16:44:11.806868", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\n", + "/kaggle/input/icr-identify-age-related-conditions/greeks.csv\n", + "/kaggle/input/icr-identify-age-related-conditions/train.csv\n", + "/kaggle/input/icr-identify-age-related-conditions/test.csv\n" + ] + } + ], + "source": [ + "# This Python 3 environment comes with many helpful analytics libraries installed\n", + "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", + "# For example, here's several helpful packages to load\n", + "\n", + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", + "\n", + "# Input data files are available in the read-only \"../input/\" directory\n", + "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", + "\n", + "import os\n", + "for dirname, _, filenames in os.walk('/kaggle/input'):\n", + " for filename in filenames:\n", + " print(os.path.join(dirname, filename))\n", + "\n", + "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", + "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bf665cc3", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:11.884566Z", + "iopub.status.busy": "2023-07-24T16:44:11.883586Z", + "iopub.status.idle": "2023-07-24T16:44:14.200337Z", + "shell.execute_reply": "2023-07-24T16:44:14.199061Z" + }, + "papermill": { + "duration": 2.340334, + "end_time": "2023-07-24T16:44:14.203977", + "exception": false, + "start_time": "2023-07-24T16:44:11.863643", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", + " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" + ] + } + ], + "source": [ + "from fastai.imports import *\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "pd.set_option('display.max_columns', None)\n", + "pd.set_option('display.max_rows', None)\n", + "\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "from numpy import random\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import 
mean_absolute_error\n", + "from sklearn.tree import DecisionTreeClassifier, export_graphviz\n", + "import graphviz\n", + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c3b90dfc", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:14.245013Z", + "iopub.status.busy": "2023-07-24T16:44:14.243722Z", + "iopub.status.idle": "2023-07-24T16:44:14.311313Z", + "shell.execute_reply": "2023-07-24T16:44:14.309839Z" + }, + "papermill": { + "duration": 0.091846, + "end_time": "2023-07-24T16:44:14.315134", + "exception": false, + "start_time": "2023-07-24T16:44:14.223288", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/train.csv\")\n", + "test_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/test.csv\")\n", + "greeks_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/greeks.csv\")\n", + "sample_submission_df = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ee096853", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:14.353515Z", + "iopub.status.busy": "2023-07-24T16:44:14.353113Z", + "iopub.status.idle": "2023-07-24T16:44:14.424332Z", + "shell.execute_reply": "2023-07-24T16:44:14.423047Z" + }, + "papermill": { + "duration": 0.09367, + "end_time": "2023-07-24T16:44:14.427193", + "exception": false, + "start_time": "2023-07-24T16:44:14.333523", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdABAFAHAMARAXAYAZBCBDBNBPBQBRBZCBCCCDCFCHCLCRCSCUCWDADEDFDHDIDLDNDUDVDYEBEEEGEHEJELEPEUFCFDFEFIFLFRFSGBGEGFGHGIGLClass
0000ff2bfdfe90.2093773109.0332985.20014722.3944078.1386880.6998610.0255789.8122145.5556344126.5873122.5984175.638726152.707705823.928241257.43237747.2233580.56348123.3876004.8519150.0234821.0502250.06922513.7841111.30201236.20595669.08340295.5705750.238680.28423289.24556084.3166429.6571045.3106901.7430723.1877047.2941761.9872831433.1667500.949104B30.87942078.5269683.82838413.39464010.2650739028.2919213.5834507.2981621.738550.09482211.33913872.6110632003.81031922.13622969.8349440.1203431
1007255e476980.145282978.7641685.20014736.9688898.1386883.6321900.02557813.5177901.2299005496.9282419.4205155.86803014.75472051.216883257.43237730.2843450.48471050.6282086.0850410.0314421.1138751.11780028.3109531.35718237.47656870.79836178.5531000.238680.363489110.58181575.7454837.5320000.0055181.7430717.2223284.9263960.8586031111.2871500.003042A109.12515995.41508652.26048017.1759840.2968506785.00347410.3589270.1732290.497060.5689329.29269872.61106327981.56275029.13543032.13199621.9780000
2013f2bd269f50.4700302635.1065485.20014732.3605538.1386886.7328400.02557812.8245701.2299005135.7802426.4825128.988531219.320160482.141594257.43237732.5637130.49585285.9553765.3764880.0362181.0502250.70035039.3647431.00961121.45964470.81970321.4266250.238680.210441120.05643865.4698428.0534641.2897391.7430736.8613527.8136748.1466511494.0764880.377208B109.12515978.5269685.390628224.2074248.7452018338.90618111.6269177.7095600.975561.19882137.07777288.60943713676.95781028.02285135.1926760.1969410
3043ac50845d50.2521073819.65177120.20161877.1122038.1386883.6853440.02557811.0537081.2299004169.6773823.6577237.28226411.050410661.518640257.43237715.2019140.71788288.1593602.3476520.0290541.4003000.63607541.1169600.72272721.53039247.27586196.6079850.238680.292431139.82457071.5712024.3548562.6553451.7430752.0038847.3860603.81332615691.5521800.614484B31.67435778.52696831.32337259.3019847.88433610965.76604014.8520226.1221620.497060.28446618.52958482.4168032094.26245239.94865690.4932480.1558290
4044fb8a146ec0.3802973733.0484485.20014714.1037388.1386883.9422550.0548103.396778102.1519805728.7341224.0108324.546318149.7171656074.859475257.43237782.2134950.53646772.64426430.5377220.0254721.0502250.69315031.7247260.82755034.41536074.06532200.1781600.238680.20770897.92012052.8388826.0199121.1449021.743079.0648567.3507203.4908461403.6563000.164268B109.12515991.99482551.14133629.1026404.27464016198.04959013.6667278.15305848.501340.12191416.408728146.1099438524.37050245.38131636.2626280.0966141
\n", + "
" + ], + "text/plain": [ + " Id AB AF AH AM AR \\\n", + "0 000ff2bfdfe9 0.209377 3109.03329 85.200147 22.394407 8.138688 \n", + "1 007255e47698 0.145282 978.76416 85.200147 36.968889 8.138688 \n", + "2 013f2bd269f5 0.470030 2635.10654 85.200147 32.360553 8.138688 \n", + "3 043ac50845d5 0.252107 3819.65177 120.201618 77.112203 8.138688 \n", + "4 044fb8a146ec 0.380297 3733.04844 85.200147 14.103738 8.138688 \n", + "\n", + " AX AY AZ BC BD BN BP \\\n", + "0 0.699861 0.025578 9.812214 5.555634 4126.58731 22.5984 175.638726 \n", + "1 3.632190 0.025578 13.517790 1.229900 5496.92824 19.4205 155.868030 \n", + "2 6.732840 0.025578 12.824570 1.229900 5135.78024 26.4825 128.988531 \n", + "3 3.685344 0.025578 11.053708 1.229900 4169.67738 23.6577 237.282264 \n", + "4 3.942255 0.054810 3.396778 102.151980 5728.73412 24.0108 324.546318 \n", + "\n", + " BQ BR BZ CB CC CD \\\n", + "0 152.707705 823.928241 257.432377 47.223358 0.563481 23.387600 \n", + "1 14.754720 51.216883 257.432377 30.284345 0.484710 50.628208 \n", + "2 219.320160 482.141594 257.432377 32.563713 0.495852 85.955376 \n", + "3 11.050410 661.518640 257.432377 15.201914 0.717882 88.159360 \n", + "4 149.717165 6074.859475 257.432377 82.213495 0.536467 72.644264 \n", + "\n", + " CF CH CL CR CS CU CW \\\n", + "0 4.851915 0.023482 1.050225 0.069225 13.784111 1.302012 36.205956 \n", + "1 6.085041 0.031442 1.113875 1.117800 28.310953 1.357182 37.476568 \n", + "2 5.376488 0.036218 1.050225 0.700350 39.364743 1.009611 21.459644 \n", + "3 2.347652 0.029054 1.400300 0.636075 41.116960 0.722727 21.530392 \n", + "4 30.537722 0.025472 1.050225 0.693150 31.724726 0.827550 34.415360 \n", + "\n", + " DA DE DF DH DI DL DN \\\n", + "0 69.08340 295.570575 0.23868 0.284232 89.245560 84.31664 29.657104 \n", + "1 70.79836 178.553100 0.23868 0.363489 110.581815 75.74548 37.532000 \n", + "2 70.81970 321.426625 0.23868 0.210441 120.056438 65.46984 28.053464 \n", + "3 47.27586 196.607985 0.23868 0.292431 139.824570 71.57120 24.354856 \n", + 
"4 74.06532 200.178160 0.23868 0.207708 97.920120 52.83888 26.019912 \n", + "\n", + " DU DV DY EB EE EG EH \\\n", + "0 5.310690 1.74307 23.187704 7.294176 1.987283 1433.166750 0.949104 \n", + "1 0.005518 1.74307 17.222328 4.926396 0.858603 1111.287150 0.003042 \n", + "2 1.289739 1.74307 36.861352 7.813674 8.146651 1494.076488 0.377208 \n", + "3 2.655345 1.74307 52.003884 7.386060 3.813326 15691.552180 0.614484 \n", + "4 1.144902 1.74307 9.064856 7.350720 3.490846 1403.656300 0.164268 \n", + "\n", + " EJ EL EP EU FC FD FE \\\n", + "0 B 30.879420 78.526968 3.828384 13.394640 10.265073 9028.291921 \n", + "1 A 109.125159 95.415086 52.260480 17.175984 0.296850 6785.003474 \n", + "2 B 109.125159 78.526968 5.390628 224.207424 8.745201 8338.906181 \n", + "3 B 31.674357 78.526968 31.323372 59.301984 7.884336 10965.766040 \n", + "4 B 109.125159 91.994825 51.141336 29.102640 4.274640 16198.049590 \n", + "\n", + " FI FL FR FS GB GE \\\n", + "0 3.583450 7.298162 1.73855 0.094822 11.339138 72.611063 \n", + "1 10.358927 0.173229 0.49706 0.568932 9.292698 72.611063 \n", + "2 11.626917 7.709560 0.97556 1.198821 37.077772 88.609437 \n", + "3 14.852022 6.122162 0.49706 0.284466 18.529584 82.416803 \n", + "4 13.666727 8.153058 48.50134 0.121914 16.408728 146.109943 \n", + "\n", + " GF GH GI GL Class \n", + "0 2003.810319 22.136229 69.834944 0.120343 1 \n", + "1 27981.562750 29.135430 32.131996 21.978000 0 \n", + "2 13676.957810 28.022851 35.192676 0.196941 0 \n", + "3 2094.262452 39.948656 90.493248 0.155829 0 \n", + "4 8524.370502 45.381316 36.262628 0.096614 1 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0f8cad4e", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:14.468316Z", + "iopub.status.busy": "2023-07-24T16:44:14.466605Z", + "iopub.status.idle": "2023-07-24T16:44:14.670578Z", + 
"shell.execute_reply": "2023-07-24T16:44:14.669435Z" + }, + "papermill": { + "duration": 0.227361, + "end_time": "2023-07-24T16:44:14.673619", + "exception": false, + "start_time": "2023-07-24T16:44:14.446258", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABAFAHAMARAXAYAZBCBDBNBPBQBRBZCBCCCDCFCHCLCRCSCUCWDADEDFDHDIDLDNDUDVDYEBEEEGEHELEPEUFCFDFEFIFLFRFSGBGEGFGHGIGLClass
count
mean0.4771493502.013221118.62451338.96855210.1282425.5455760.06032010.5664478.0530125350.38865521.419492231.32222398.3287371218.133238550.63252577.1041510.68880190.25173511.2410640.0306151.4037610.74226236.9175901.38379227.16565351.128326401.9012990.6338840.367002146.97209994.79537726.3705681.8029001.92483026.3889899.0727003.0647781731.2482150.30510769.582596105.06071269.11700571.3415266.93008610306.81073710.1110795.4331993.5339050.42150120.724856131.71498714679.59539831.48971650.5844378.5309610.175041
std0.4683882300.322717127.83895069.72822610.5188772.5516960.4168174.35064565.1669433021.3266413.478278183.99250596.4793717575.2937072076.371275159.0493020.26399451.58513013.5711330.0148081.9222100.28119517.2663470.53871714.64599321.210888317.7456231.9123840.11298986.08441928.2431878.0388259.0347211.48455518.1166796.2002812.0583441790.2274761.84749938.55570768.445620390.187057165.55154564.75426211331.2940512.93402511.49625750.1819481.3053659.991907144.18152419352.9593879.86423936.26625110.3270100.380310
min0.081187192.59328085.2001473.1775228.1386880.6998610.0255783.3967781.2299001693.6243209.88680072.9489511.33115551.216883257.43237712.4997600.17687423.3876000.5108880.0031841.0502250.06922513.7841110.1379257.0306406.90640035.9988950.2386800.04099560.23247010.3456006.3394960.0055181.7430700.8040684.9263960.286201185.5941000.0030425.39467578.5269683.8283847.5341280.2968501563.1366883.5834500.1732290.4970600.0677304.10218272.61106313.0388949.4327350.8976280.0011290.000000
25%0.2521072197.34548085.20014712.2703148.1386884.1282940.0255788.1295801.2299004155.70287019.420500156.84723927.834425424.990642257.43237723.3175670.56368864.7241925.0663060.0234821.0502250.58957529.7824671.0702987.03064037.942520188.8156900.2386800.295164102.70355378.23224020.8882640.0055181.74307014.7157925.9653921.6486791111.1606250.00304230.92746878.5269684.32465625.8153840.2968505164.6662608.5230980.1732290.4970600.06773014.03671872.6110632798.99258425.03488823.0116840.1243920.000000
50%0.3546593120.31896085.20014720.5331108.1386885.0319120.02557810.4613201.2299004997.96073021.186000193.90881661.642115627.417402257.43237742.5543300.65871579.8191049.1230000.0278601.0502250.73080034.8351301.35166536.01910449.180940307.5095950.2386800.358023130.05063096.26496025.2488000.2517411.74307021.6424568.1494042.6161191493.8174130.08517671.94930678.52696822.64114436.3940081.8701557345.1434249.9454523.0281411.1310000.25060118.77143672.6110637838.27361030.60894641.0079680.3378270.000000
75%0.5597634361.637390113.73954039.1398868.1386886.4316340.03684512.9695165.0812446035.88570023.657700247.803462134.009015975.649259257.43237777.3100970.77220699.81352013.5659010.0344271.2284450.85935040.5294011.66061737.93583261.408760507.8962000.2386800.426348165.836955110.64068030.5442241.0586901.74307034.05834410.5030483.9100701905.7014750.237276109.125159112.76665449.08535256.7144484.88021410647.95165011.5166576.2388141.5120600.53506725.608406127.59167119035.70924036.86394767.93166421.9780000.000000
max6.16166628688.1876601910.123198630.518230178.94363438.27088010.31585138.9715681463.69344853060.59924029.3073002447.810550344.644105179250.25290050092.4593002271.4361674.103032633.534408200.9675260.22407431.6881533.039675267.9428234.95150764.521624210.3309202103.40519037.8950131.0604041049.168078326.23620062.808096161.35531525.192930152.35516494.95858018.32492630243.75878042.569748109.1251591063.5945786501.2644803030.6558241578.654237143224.68230035.851039137.9327391244.22702031.365763135.7812941497.351958143790.07120081.210825191.19476421.9780001.000000
\n", + "
" + ], + "text/plain": [ + " AB AF AH AM AR \\\n", + "count 617.000000 617.000000 617.000000 617.000000 617.000000 \n", + "mean 0.477149 3502.013221 118.624513 38.968552 10.128242 \n", + "std 0.468388 2300.322717 127.838950 69.728226 10.518877 \n", + "min 0.081187 192.593280 85.200147 3.177522 8.138688 \n", + "25% 0.252107 2197.345480 85.200147 12.270314 8.138688 \n", + "50% 0.354659 3120.318960 85.200147 20.533110 8.138688 \n", + "75% 0.559763 4361.637390 113.739540 39.139886 8.138688 \n", + "max 6.161666 28688.187660 1910.123198 630.518230 178.943634 \n", + "\n", + " AX AY AZ BC BD \\\n", + "count 617.000000 617.000000 617.000000 617.000000 617.000000 \n", + "mean 5.545576 0.060320 10.566447 8.053012 5350.388655 \n", + "std 2.551696 0.416817 4.350645 65.166943 3021.326641 \n", + "min 0.699861 0.025578 3.396778 1.229900 1693.624320 \n", + "25% 4.128294 0.025578 8.129580 1.229900 4155.702870 \n", + "50% 5.031912 0.025578 10.461320 1.229900 4997.960730 \n", + "75% 6.431634 0.036845 12.969516 5.081244 6035.885700 \n", + "max 38.270880 10.315851 38.971568 1463.693448 53060.599240 \n", + "\n", + " BN BP BQ BR BZ \\\n", + "count 617.000000 617.000000 557.000000 617.000000 617.000000 \n", + "mean 21.419492 231.322223 98.328737 1218.133238 550.632525 \n", + "std 3.478278 183.992505 96.479371 7575.293707 2076.371275 \n", + "min 9.886800 72.948951 1.331155 51.216883 257.432377 \n", + "25% 19.420500 156.847239 27.834425 424.990642 257.432377 \n", + "50% 21.186000 193.908816 61.642115 627.417402 257.432377 \n", + "75% 23.657700 247.803462 134.009015 975.649259 257.432377 \n", + "max 29.307300 2447.810550 344.644105 179250.252900 50092.459300 \n", + "\n", + " CB CC CD CF CH \\\n", + "count 615.000000 614.000000 617.000000 617.000000 617.000000 \n", + "mean 77.104151 0.688801 90.251735 11.241064 0.030615 \n", + "std 159.049302 0.263994 51.585130 13.571133 0.014808 \n", + "min 12.499760 0.176874 23.387600 0.510888 0.003184 \n", + "25% 23.317567 0.563688 64.724192 5.066306 
0.023482 \n", + "50% 42.554330 0.658715 79.819104 9.123000 0.027860 \n", + "75% 77.310097 0.772206 99.813520 13.565901 0.034427 \n", + "max 2271.436167 4.103032 633.534408 200.967526 0.224074 \n", + "\n", + " CL CR CS CU CW DA \\\n", + "count 617.000000 617.000000 617.000000 617.000000 617.000000 617.000000 \n", + "mean 1.403761 0.742262 36.917590 1.383792 27.165653 51.128326 \n", + "std 1.922210 0.281195 17.266347 0.538717 14.645993 21.210888 \n", + "min 1.050225 0.069225 13.784111 0.137925 7.030640 6.906400 \n", + "25% 1.050225 0.589575 29.782467 1.070298 7.030640 37.942520 \n", + "50% 1.050225 0.730800 34.835130 1.351665 36.019104 49.180940 \n", + "75% 1.228445 0.859350 40.529401 1.660617 37.935832 61.408760 \n", + "max 31.688153 3.039675 267.942823 4.951507 64.521624 210.330920 \n", + "\n", + " DE DF DH DI DL \\\n", + "count 617.000000 617.000000 617.000000 617.000000 617.000000 \n", + "mean 401.901299 0.633884 0.367002 146.972099 94.795377 \n", + "std 317.745623 1.912384 0.112989 86.084419 28.243187 \n", + "min 35.998895 0.238680 0.040995 60.232470 10.345600 \n", + "25% 188.815690 0.238680 0.295164 102.703553 78.232240 \n", + "50% 307.509595 0.238680 0.358023 130.050630 96.264960 \n", + "75% 507.896200 0.238680 0.426348 165.836955 110.640680 \n", + "max 2103.405190 37.895013 1.060404 1049.168078 326.236200 \n", + "\n", + " DN DU DV DY EB EE \\\n", + "count 617.000000 616.000000 617.000000 617.000000 617.000000 617.000000 \n", + "mean 26.370568 1.802900 1.924830 26.388989 9.072700 3.064778 \n", + "std 8.038825 9.034721 1.484555 18.116679 6.200281 2.058344 \n", + "min 6.339496 0.005518 1.743070 0.804068 4.926396 0.286201 \n", + "25% 20.888264 0.005518 1.743070 14.715792 5.965392 1.648679 \n", + "50% 25.248800 0.251741 1.743070 21.642456 8.149404 2.616119 \n", + "75% 30.544224 1.058690 1.743070 34.058344 10.503048 3.910070 \n", + "max 62.808096 161.355315 25.192930 152.355164 94.958580 18.324926 \n", + "\n", + " EG EH EL EP EU \\\n", + "count 617.000000 
617.000000 557.000000 617.000000 617.000000 \n", + "mean 1731.248215 0.305107 69.582596 105.060712 69.117005 \n", + "std 1790.227476 1.847499 38.555707 68.445620 390.187057 \n", + "min 185.594100 0.003042 5.394675 78.526968 3.828384 \n", + "25% 1111.160625 0.003042 30.927468 78.526968 4.324656 \n", + "50% 1493.817413 0.085176 71.949306 78.526968 22.641144 \n", + "75% 1905.701475 0.237276 109.125159 112.766654 49.085352 \n", + "max 30243.758780 42.569748 109.125159 1063.594578 6501.264480 \n", + "\n", + " FC FD FE FI FL \\\n", + "count 616.000000 617.000000 617.000000 617.000000 616.000000 \n", + "mean 71.341526 6.930086 10306.810737 10.111079 5.433199 \n", + "std 165.551545 64.754262 11331.294051 2.934025 11.496257 \n", + "min 7.534128 0.296850 1563.136688 3.583450 0.173229 \n", + "25% 25.815384 0.296850 5164.666260 8.523098 0.173229 \n", + "50% 36.394008 1.870155 7345.143424 9.945452 3.028141 \n", + "75% 56.714448 4.880214 10647.951650 11.516657 6.238814 \n", + "max 3030.655824 1578.654237 143224.682300 35.851039 137.932739 \n", + "\n", + " FR FS GB GE GF \\\n", + "count 617.000000 615.000000 617.000000 617.000000 617.000000 \n", + "mean 3.533905 0.421501 20.724856 131.714987 14679.595398 \n", + "std 50.181948 1.305365 9.991907 144.181524 19352.959387 \n", + "min 0.497060 0.067730 4.102182 72.611063 13.038894 \n", + "25% 0.497060 0.067730 14.036718 72.611063 2798.992584 \n", + "50% 1.131000 0.250601 18.771436 72.611063 7838.273610 \n", + "75% 1.512060 0.535067 25.608406 127.591671 19035.709240 \n", + "max 1244.227020 31.365763 135.781294 1497.351958 143790.071200 \n", + "\n", + " GH GI GL Class \n", + "count 617.000000 617.000000 616.000000 617.000000 \n", + "mean 31.489716 50.584437 8.530961 0.175041 \n", + "std 9.864239 36.266251 10.327010 0.380310 \n", + "min 9.432735 0.897628 0.001129 0.000000 \n", + "25% 25.034888 23.011684 0.124392 0.000000 \n", + "50% 30.608946 41.007968 0.337827 0.000000 \n", + "75% 36.863947 67.931664 21.978000 0.000000 \n", + "max 
81.210825 191.194764 21.978000 1.000000 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9d4496a1", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:14.716229Z", + "iopub.status.busy": "2023-07-24T16:44:14.715836Z", + "iopub.status.idle": "2023-07-24T16:44:14.936888Z", + "shell.execute_reply": "2023-07-24T16:44:14.934929Z" + }, + "papermill": { + "duration": 0.247879, + "end_time": "2023-07-24T16:44:14.942040", + "exception": false, + "start_time": "2023-07-24T16:44:14.694161", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "train_df.Class.value_counts().plot.pie()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0224fd32", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:15.009821Z", + "iopub.status.busy": "2023-07-24T16:44:15.009423Z", + "iopub.status.idle": "2023-07-24T16:44:15.019078Z", + "shell.execute_reply": "2023-07-24T16:44:15.018230Z" + }, + "papermill": { + "duration": 0.034162, + "end_time": "2023-07-24T16:44:15.021361", + "exception": false, + "start_time": "2023-07-24T16:44:14.987199", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Id 0\n", + "AB 0\n", + "AF 0\n", + "AH 0\n", + "AM 0\n", + "AR 0\n", + "AX 0\n", + "AY 0\n", + "AZ 0\n", + "BC 0\n", + "BD 0\n", + "BN 0\n", + "BP 0\n", + "BQ 60\n", + "BR 0\n", + "BZ 0\n", + "CB 2\n", + "CC 3\n", + "CD 0\n", + "CF 0\n", + "CH 0\n", + "CL 0\n", + "CR 0\n", + "CS 0\n", + "CU 0\n", + "CW 0\n", + "DA 0\n", + "DE 0\n", + "DF 0\n", + "DH 0\n", + "DI 0\n", + "DL 0\n", + "DN 0\n", + "DU 1\n", + "DV 0\n", + "DY 0\n", + "EB 0\n", + "EE 0\n", + "EG 0\n", + "EH 0\n", + "EJ 0\n", + "EL 60\n", + "EP 0\n", + "EU 0\n", + "FC 1\n", + "FD 0\n", + "FE 0\n", + "FI 0\n", + "FL 1\n", + "FR 0\n", + "FS 2\n", + "GB 0\n", + "GE 0\n", + "GF 0\n", + "GH 0\n", + "GI 0\n", + "GL 1\n", + "Class 0\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fa6c03f8", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:15.065115Z", + "iopub.status.busy": "2023-07-24T16:44:15.064712Z", + "iopub.status.idle": "2023-07-24T16:44:15.088971Z", + "shell.execute_reply": "2023-07-24T16:44:15.086866Z" + }, + "papermill": { + "duration": 0.050126, + "end_time": "2023-07-24T16:44:15.092480", + 
"exception": false, + "start_time": "2023-07-24T16:44:15.042354", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 617 entries, 0 to 616\n", + "Data columns (total 58 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 617 non-null object \n", + " 1 AB 617 non-null float64\n", + " 2 AF 617 non-null float64\n", + " 3 AH 617 non-null float64\n", + " 4 AM 617 non-null float64\n", + " 5 AR 617 non-null float64\n", + " 6 AX 617 non-null float64\n", + " 7 AY 617 non-null float64\n", + " 8 AZ 617 non-null float64\n", + " 9 BC 617 non-null float64\n", + " 10 BD 617 non-null float64\n", + " 11 BN 617 non-null float64\n", + " 12 BP 617 non-null float64\n", + " 13 BQ 557 non-null float64\n", + " 14 BR 617 non-null float64\n", + " 15 BZ 617 non-null float64\n", + " 16 CB 615 non-null float64\n", + " 17 CC 614 non-null float64\n", + " 18 CD 617 non-null float64\n", + " 19 CF 617 non-null float64\n", + " 20 CH 617 non-null float64\n", + " 21 CL 617 non-null float64\n", + " 22 CR 617 non-null float64\n", + " 23 CS 617 non-null float64\n", + " 24 CU 617 non-null float64\n", + " 25 CW 617 non-null float64\n", + " 26 DA 617 non-null float64\n", + " 27 DE 617 non-null float64\n", + " 28 DF 617 non-null float64\n", + " 29 DH 617 non-null float64\n", + " 30 DI 617 non-null float64\n", + " 31 DL 617 non-null float64\n", + " 32 DN 617 non-null float64\n", + " 33 DU 616 non-null float64\n", + " 34 DV 617 non-null float64\n", + " 35 DY 617 non-null float64\n", + " 36 EB 617 non-null float64\n", + " 37 EE 617 non-null float64\n", + " 38 EG 617 non-null float64\n", + " 39 EH 617 non-null float64\n", + " 40 EJ 617 non-null object \n", + " 41 EL 557 non-null float64\n", + " 42 EP 617 non-null float64\n", + " 43 EU 617 non-null float64\n", + " 44 FC 616 non-null float64\n", + " 45 FD 617 non-null float64\n", + " 46 FE 617 non-null float64\n", + " 
47 FI 617 non-null float64\n", + " 48 FL 616 non-null float64\n", + " 49 FR 617 non-null float64\n", + " 50 FS 615 non-null float64\n", + " 51 GB 617 non-null float64\n", + " 52 GE 617 non-null float64\n", + " 53 GF 617 non-null float64\n", + " 54 GH 617 non-null float64\n", + " 55 GI 617 non-null float64\n", + " 56 GL 616 non-null float64\n", + " 57 Class 617 non-null int64 \n", + "dtypes: float64(55), int64(1), object(2)\n", + "memory usage: 279.7+ KB\n" + ] + } + ], + "source": [ + "train_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "57aa29b6", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:15.136436Z", + "iopub.status.busy": "2023-07-24T16:44:15.135977Z", + "iopub.status.idle": "2023-07-24T16:44:15.175777Z", + "shell.execute_reply": "2023-07-24T16:44:15.174551Z" + }, + "papermill": { + "duration": 0.065128, + "end_time": "2023-07-24T16:44:15.178796", + "exception": false, + "start_time": "2023-07-24T16:44:15.113668", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "modes = train_df.mode().iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "236e131c", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:15.228183Z", + "iopub.status.busy": "2023-07-24T16:44:15.227729Z", + "iopub.status.idle": "2023-07-24T16:44:15.295632Z", + "shell.execute_reply": "2023-07-24T16:44:15.294176Z" + }, + "papermill": { + "duration": 0.094191, + "end_time": "2023-07-24T16:44:15.298576", + "exception": false, + "start_time": "2023-07-24T16:44:15.204385", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def process_data(df):\n", + " df.fillna(modes, inplace=True)\n", + " df[\"EJ\"] = pd.Categorical(df.EJ)\n", + " \n", + "process_data(train_df)\n", + "process_data(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4d7d57b7", + "metadata": { + "execution": { + "iopub.execute_input": 
"2023-07-24T16:44:15.342144Z", + "iopub.status.busy": "2023-07-24T16:44:15.341506Z", + "iopub.status.idle": "2023-07-24T16:44:15.361922Z", + "shell.execute_reply": "2023-07-24T16:44:15.360200Z" + }, + "papermill": { + "duration": 0.046219, + "end_time": "2023-07-24T16:44:15.365510", + "exception": false, + "start_time": "2023-07-24T16:44:15.319291", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 617 entries, 0 to 616\n", + "Data columns (total 58 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 617 non-null object \n", + " 1 AB 617 non-null float64 \n", + " 2 AF 617 non-null float64 \n", + " 3 AH 617 non-null float64 \n", + " 4 AM 617 non-null float64 \n", + " 5 AR 617 non-null float64 \n", + " 6 AX 617 non-null float64 \n", + " 7 AY 617 non-null float64 \n", + " 8 AZ 617 non-null float64 \n", + " 9 BC 617 non-null float64 \n", + " 10 BD 617 non-null float64 \n", + " 11 BN 617 non-null float64 \n", + " 12 BP 617 non-null float64 \n", + " 13 BQ 617 non-null float64 \n", + " 14 BR 617 non-null float64 \n", + " 15 BZ 617 non-null float64 \n", + " 16 CB 617 non-null float64 \n", + " 17 CC 617 non-null float64 \n", + " 18 CD 617 non-null float64 \n", + " 19 CF 617 non-null float64 \n", + " 20 CH 617 non-null float64 \n", + " 21 CL 617 non-null float64 \n", + " 22 CR 617 non-null float64 \n", + " 23 CS 617 non-null float64 \n", + " 24 CU 617 non-null float64 \n", + " 25 CW 617 non-null float64 \n", + " 26 DA 617 non-null float64 \n", + " 27 DE 617 non-null float64 \n", + " 28 DF 617 non-null float64 \n", + " 29 DH 617 non-null float64 \n", + " 30 DI 617 non-null float64 \n", + " 31 DL 617 non-null float64 \n", + " 32 DN 617 non-null float64 \n", + " 33 DU 617 non-null float64 \n", + " 34 DV 617 non-null float64 \n", + " 35 DY 617 non-null float64 \n", + " 36 EB 617 non-null float64 \n", + " 37 EE 617 non-null float64 
\n", + " 38 EG 617 non-null float64 \n", + " 39 EH 617 non-null float64 \n", + " 40 EJ 617 non-null category\n", + " 41 EL 617 non-null float64 \n", + " 42 EP 617 non-null float64 \n", + " 43 EU 617 non-null float64 \n", + " 44 FC 617 non-null float64 \n", + " 45 FD 617 non-null float64 \n", + " 46 FE 617 non-null float64 \n", + " 47 FI 617 non-null float64 \n", + " 48 FL 617 non-null float64 \n", + " 49 FR 617 non-null float64 \n", + " 50 FS 617 non-null float64 \n", + " 51 GB 617 non-null float64 \n", + " 52 GE 617 non-null float64 \n", + " 53 GF 617 non-null float64 \n", + " 54 GH 617 non-null float64 \n", + " 55 GI 617 non-null float64 \n", + " 56 GL 617 non-null float64 \n", + " 57 Class 617 non-null int64 \n", + "dtypes: category(1), float64(55), int64(1), object(1)\n", + "memory usage: 275.6+ KB\n" + ] + } + ], + "source": [ + "train_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "88337216", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:15.416908Z", + "iopub.status.busy": "2023-07-24T16:44:15.416024Z", + "iopub.status.idle": "2023-07-24T16:44:15.427154Z", + "shell.execute_reply": "2023-07-24T16:44:15.425966Z" + }, + "papermill": { + "duration": 0.040073, + "end_time": "2023-07-24T16:44:15.430393", + "exception": false, + "start_time": "2023-07-24T16:44:15.390320", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(0 B\n", + " 1 A\n", + " 2 B\n", + " 3 B\n", + " 4 B\n", + " Name: EJ, dtype: category\n", + " Categories (2, object): ['A', 'B'],\n", + " 0 1\n", + " 1 0\n", + " 2 1\n", + " 3 1\n", + " 4 1\n", + " dtype: int8)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.EJ.head(), train_df.EJ.cat.codes.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "14fb9aa1", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:15.484517Z", 
+ "iopub.status.busy": "2023-07-24T16:44:15.483724Z", + "iopub.status.idle": "2023-07-24T16:44:15.489773Z", + "shell.execute_reply": "2023-07-24T16:44:15.488952Z" + }, + "papermill": { + "duration": 0.031671, + "end_time": "2023-07-24T16:44:15.492329", + "exception": false, + "start_time": "2023-07-24T16:44:15.460658", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "categoricals = [\"EJ\"]\n", + "dependent = \"Class\"\n", + "conts = [column for column in train_df.columns if not column in categoricals + [dependent] + [\"Id\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c1a60f26", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:15.536967Z", + "iopub.status.busy": "2023-07-24T16:44:15.536243Z", + "iopub.status.idle": "2023-07-24T16:44:16.110589Z", + "shell.execute_reply": "2023-07-24T16:44:16.109397Z" + }, + "papermill": { + "duration": 0.599525, + "end_time": "2023-07-24T16:44:16.113242", + "exception": false, + "start_time": "2023-07-24T16:44:15.513717", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0.5, 1.0, 'Histogram')]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA6QAAAHUCAYAAAAtAuU1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAABTcUlEQVR4nO3de1xVVf7/8feRy4EUSUG5JJKV11AzLAQz7yiGltaoOWk2WpmXJHRs0CmxKRktFcNL2ahkplKZ1ZSZqHkbsEEStbtNllicUEMQI0Dcvz/8eb4dAQVCtsLr+Xjsx8Oz9trrfPbxsnyz91nbYhiGIQAAAAAAalg9swsAAAAAANRNBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgAAAAAwBYEUAAAAAGAKAikAAAAAwBQEUlzREhMTZbFY7Juzs7OaNWumhx56SD/++ONleU+LxaLY2Fj76y+++EKxsbH6/vvvS/UdPXq0rr/++stSx8XExsY6fC7XXHONmjVrpn79+ikhIUGnTp26YmqtCT169FCPHj2qZazff64Wi0UNGzZUWFiY1q5dW+UxN27c6PBnCgBQdef/b7B3794y90dGRjrMd9dff71Gjx5dqfdISUlRbGysTp48WfVCAVQIgRRXhZUrVyo1NVXJycl6+OGHtXbtWnXr1k2nT5+u9vdKTU3V2LFj7a+/+OILzZo1q8xA+tRTT2nDhg3VXkNFbdq0Sampqdq0aZNeeOEFNW/eXNOmTdPNN9+s/fv3O/Q1u9bLacmSJVqyZEm1jXffffcpNTVVKSkpeumll5SXl6cRI0ZozZo1VRpv48aNmjVrVrXVBwCouA0bNuipp56q1DEpKSmaNWsWgRSoAc5mFwBURFBQkDp37ixJ6tmzp0pKSvSPf/xD77zzjv785z9X63t16dKlwn1vvPHGan3vygoODpa3t7f99fDhwzVx4kR1795dgwYN0jfffCOr1SrJ/Fovp3bt2lXreD4+PvY/B6Ghoeratauuv/56vfzyyxoxYkS1vhcA4PLq1KmT2SVUWnFxsf3OMKC24woprkrnw8IPP/wgSfrtt98UExOjFi1ayNXVVdddd50mTJhQ6ieb27ZtU48ePeTl5SV3d3c1b95c9957r3799Vd7n9/fspuYmKg//elPks4F4fO3cSYmJkoqfRtsp06d1K1bt1L1lpSU6LrrrtOQIUPsbUVFRXr22WfVpk0bWa1WNWnSRA899JCOHTv2hz6bjh07asaMGTpy5IiSkpLs7WXdsrt48WLdeeedatq0qerXr6/27dtr7ty5Ki4uduhnGIZmz56twMBAubm5qXPnzkpOTi51q+z27dtlsVi0du1azZgxQ/7+/mrYsKH69Omjr7/+ulStK1asUMeOHeXm5qbGjRtr8ODB+vLLLx36fPfddxo+fLj8/f1ltVrl4+Oj3r17KyMjw96nrFt2ly5dqo4dO6pBgwby8PBQmzZtNH369Mp9mP9fYGCgmjRpop9//tmhPSkpSeHh4fLz85O7u7vatm2rv/3tbw5X7kePHq3FixdLcrwd+PwVd8MwtGTJEt1yyy1yd3dXo0aNdN999+m7776rUq0AAEcX3rJ79uxZPfvss2rdurXc3d117bXXqkOHDlq4cKGkc1+L+etf/ypJatGihf3f7e3bt9uPnzt3rn3+btq0qUaNGqWjR486vG9l587XXntNU6ZM0XXXXSer1apvv/1Wx44d0/jx49WuXTs1aNBATZs2Va9evbRr1y6H9/r+++9lsVj0/PPPa86cObr++uvl7u6uHj166JtvvlFxcbH+9re/yd/fX56enho8eLCys7Or/8MGqoAfu+Cq9O2330qSmjRpIsMwdM8992jr1q2KiYlRt27ddODAAc2cOVOpqalKTU2V1WrV999/r7vuukvdunXTihUrdO211+rHH3/Upk2bVFRUpGuuuabU+9x1112aPXu2pk+frsW
LF+vWW2+VVP7VxoceekiTJ0/WoUOH1LJlS3v75s2b9dNPP+mhhx6SdG4yu/vuu7Vr1y5NmzZNYWFh+uGHHzRz5kz16NFDe/fulbu7e5U/n0GDBmnatGnauXOnRo0aVW6///3vfxoxYoQ9yO/fv1/PPfecvvrqK61YscLeb8aMGYqLi9MjjzyiIUOGKDMzU2PHjlVxcbFatWpVatzp06era9eu+te//qW8vDw9+eSTGjhwoL788ks5OTlJkuLi4jR9+nTdf//9iouL04kTJxQbG6vQ0FClpaXZP78BAwaopKREc+fOVfPmzXX8+HGlpKRc9DaqdevWafz48Zo0aZJeeOEF1atXT99++62++OKLKn2eubm5+uWXX0pdPT906JAGDBigqKgo1a9fX1999ZXmzJmj//73v9q2bZukc7dKnz59Wm+99ZZSU1Ptx/r5+UmSHn30USUmJurxxx/XnDlz9Msvv+iZZ55RWFiY9u/fLx8fnyrVDAC1XUlJic6cOVOq3TCMix43d+5cxcbG6u9//7vuvPNOFRcX66uvvrLPK2PHjtUvv/yihIQEvf322/Z/r8/fjfPYY49p2bJlmjhxoiIjI/X999/rqaee0vbt2/Xpp5/a71yq7NwZExOj0NBQvfTSS6pXr56aNm1q/yH1zJkz5evrq/z8fG3YsEE9evTQ1q1bS/0wdvHixerQoYMWL16skydPasqUKRo4cKBCQkLk4uKiFStW6IcfftDUqVM1duxYvffee5X6zIHLwgCuYCtXrjQkGXv27DGKi4uNU6dOGe+//77RpEkTw8PDw7DZbMamTZsMScbcuXMdjk1KSjIkGcuWLTMMwzDeeustQ5KRkZFx0feUZMycOdP++s033zQkGR9//HGpvg8++KARGBhof338+HHD1dXVmD59ukO/oUOHGj4+PkZxcbFhGIaxdu1aQ5Kxfv16h35paWmGJGPJkiUXrXHmzJmGJOPYsWNl7i8oKDAkGREREeXWeqGSkhKjuLjYWLVqleHk5GT88ssvhmEYxi+//GJYrVZj2LBhDv1TU1MNSUb37t3tbR9//LEhyRgwYIBD3zfeeMOQZKSmphqGYRg5OTmGu7t7qX5HjhwxrFarMWLECMMwzn2ekoz4+PiLfh7du3d3qGPixInGtddee9FjyiPJGD9+vFFcXGwUFRUZ33zzjTFo0CDDw8PD2Lt3b7nHnT171iguLjZ27NhhSDL2799v3zdhwgSjrH9uz3+G8+bNc2jPzMw03N3djWnTplXpHACgNjv/f4OLbb+f7wIDA40HH3zQ/joyMtK45ZZbLvoezz//vCHJOHz4sEP7l19+aZ8nfu+TTz4xJNnn/6rMnXfeeeclz/3MmTNGcXGx0bt3b2Pw4MH29sOHDxuSjI4dOxolJSX29vj4eEOSMWjQIIdxoqKiDElGbm7uJd8TuNy4ZRdXhS5dusjFxUUeHh6KjIyUr6+vPvzwQ/n4+NivRF24gt6f/vQn1a9fX1u3bpUk3XLLLXJ1ddUjjzyiV1999bLcEunl5aWBAwfq1Vdf1dmzZyVJOTk5evfddzVq1Cj7d0Hef/99XXvttRo4cKDOnDlj32655Rb5+vrabwuqKuMSPx0+b9++fRo0aJC8vLzk5OQkFxcXjRo1SiUlJfrmm28kSXv27FFhYaGGDh3qcGyXLl3KXbV30KBBDq87dOgg6f9usU5NTVVBQUGp37OAgAD16tXL/nvWuHFj3XjjjXr++ec1f/587du3z/65Xsztt9+ukydP6v7779e7776r48ePX/KY31uyZIlcXFzk6uqqVq1a6cMPP9TatWsVHBzs0O+7777TiBEj5Ovra//8unfvLkmlbj0uy/vvvy+LxaIHHnjA4c+Br6+vOnbs+If/HABAbbZq1SqlpaWV2u64446LHnf77bdr//79Gj9+vD766CPl5eVV+D0//vhjSaX/z3H77berbdu29vmrKnPnvffeW2b7Sy+9pFtvvVVubm5
ydnaWi4uLtm7dWuY8M2DAANWr93//vW/btq2kc3d8/d759iNHjpRzpkDNIZDiqnB+0tm3b59++uknHThwQF27dpUknThxQs7OzmrSpInDMRaLRb6+vjpx4oSkc7fZbtmyRU2bNtWECRN044036sYbb7R/Z6S6/OUvf9GPP/6o5ORkSdLatWtVWFjoMHn9/PPPOnnypFxdXeXi4uKw2Wy2SgeoC50Pfv7+/uX2OXLkiLp166Yff/xRCxcu1K5du5SWlmb/vmNBQYEk2T+/sm4dLe92Ui8vL4fX5xdWunDM87dB/Z6/v799v8Vi0datW9WvXz/NnTtXt956q5o0aaLHH3+8zEfbnDdy5Ej7bUn33nuvmjZtqpCQEPvvyaUMHTpUaWlpSklJ0csvvywPDw8NHz5chw4dsvfJz89Xt27d9Mknn+jZZ5/V9u3blZaWprffftvhXC/m559/lmEY8vHxKfXnYM+ePX/4zwEA1GZt27ZV586dS22enp4XPS4mJkYvvPCC9uzZo4iICHl5eal3797lPkbm9yo6f1Vl7ixrzPnz5+uxxx5TSEiI1q9frz179igtLU39+/cvc55p3Lixw2tXV9eLtv/2229l1gLUJL5DiqvC+UmnLF5eXjpz5oyOHTvmEEoNw5DNZtNtt91mb+vWrZu6deumkpIS7d27VwkJCYqKipKPj4+GDx9eLbX269dP/v7+Wrlypfr166eVK1cqJCTEYSVYb29veXl5adOmTWWO4eHh8YdqOP+dkIs9m/Odd97R6dOn9fbbbyswMNDe/vvFgqT/C5cXLugjSTabrUrPNj0/ZlZWVql9P/30k8PKwYGBgVq+fLkk6ZtvvtEbb7yh2NhYFRUV6aWXXir3PR566CE99NBDOn36tHbu3KmZM2cqMjJS33zzjcP5lqVJkyb2P2+hoaFq27atunfvrieeeELvv/++pHMLZP3000/avn27/aqopEo9IsDb21sWi0W7du2yh/bfK6sNAPDHODs7Kzo6WtHR0Tp58qS2bNmi6dOnq1+/fsrMzCxzTYnzfj9/NWvWzGHf7+evqsydFoulVNvq1avVo0cPLV261KH9Yj+UBa42XCHFVa93796Szv2j/Xvr16/X6dOn7ft/z8nJSSEhIfargZ9++mm54194de9SnJycNHLkSL3zzjvatWuX9u7dq7/85S8OfSIjI3XixAmVlJSU+dPd1q1bV+i9yrJ//37Nnj1b119/falbhX7v/MT3+9BjGIZeeeUVh34hISGyWq0OK/ZK525HOn8ltrJCQ0Pl7u5e6vfs6NGj2rZtW5m/Z5LUqlUr/f3vf1f79u0v+nv2e/Xr11dERIRmzJihoqIiff7555Wut1u3bho1apQ++OAD+8JEZX1+kvTyyy+XOr68P0ORkZEyDEM//vhjmX8O2rdvX+laAQAVd+211+q+++7ThAkT9Msvv9hXQC/v3+1evXpJKv1/jrS0NH355Zf2+au65k6LxVJqnjlw4IDDInnA1Y4rpLjq9e3bV/369dOTTz6pvLw8de3a1b7KbqdOnTRy5EhJ576DsW3bNt11111q3ry5fvvtN/tKsn369Cl3/KCgIEnSsmXL5OHhITc3N7Vo0aLUbam/95e//EVz5szRiBEj5O7urmHDhjnsHz58uF5//XUNGDBAkydP1u233y4XFxcdPXpUH3/8se6++24NHjz4kueenp4uT09PFRcX66efftLWrVv12muvqWnTpvr3v/9tvyWnvM/N1dVV999/v6ZNm6bffvtNS5cuVU5OjkO/xo0bKzo6WnFxcWrUqJEGDx6so0ePatasWfLz83P4rkpFXXvttXrqqac0ffp0jRo1Svfff79OnDihWbNmyc3NTTNnzpR0btKdOHGi/vSnP6lly5ZydXXVtm3bdODAAf3tb38rd/yHH35Y7u7u6tq1q/z8/GSz2RQXFydPT0+HK+aV8Y9//ENJSUl66qmntGXLFoWFhalRo0YaN26cZs6
cKRcXF73++uvav39/qWPPB8s5c+YoIiJCTk5O6tChg7p27apHHnlEDz30kPbu3as777xT9evXV1ZWlnbv3q327dvrscceq1K9AICyDRw40P588yZNmuiHH35QfHy8AgMD7Su8n/93e+HChXrwwQfl4uKi1q1bq3Xr1nrkkUeUkJCgevXqKSIiwr7KbkBAgJ544glJ1Td3RkZG6h//+Idmzpyp7t276+uvv9YzzzyjFi1alLnCMHBVMnVJJeASzq+kl5aWdtF+BQUFxpNPPmkEBgYaLi4uhp+fn/HYY48ZOTk59j6pqanG4MGDjcDAQMNqtRpeXl5G9+7djffee89hLF2wyq5hnFulrkWLFoaTk5MhyVi5cqVhGBdfuTYsLMyQZPz5z38uc39xcbHxwgsvGB07djTc3NyMBg0aGG3atDEeffRR49ChQxc93/Or7J7frFar4efnZ4SHhxsLFy408vLySh1TVq3//ve/7e9/3XXXGX/961+NDz/8sNSqwmfPnjWeffZZo1mzZoarq6vRoUMH4/333zc6duzosMrf+ZUC33zzTYf3Ob/63/nP7bx//etfRocOHQxXV1fD09PTuPvuu43PP//cvv/nn382Ro8ebbRp08aoX7++0aBBA6NDhw7GggULjDNnztj7XbjK7quvvmr07NnT8PHxMVxdXQ1/f39j6NChxoEDBy76uRrGud//CRMmlLnvr3/9qyHJ2LFjh2EYhpGSkmKEhoYa11xzjdGkSRNj7NixxqefflrqXAsLC42xY8caTZo0MSwWS6mVG1esWGGEhIQY9evXN9zd3Y0bb7zRGDVq1EVX9QWAuupS/ze46667LrrK7rx584ywsDDD29vbcHV1NZo3b26MGTPG+P777x3GiYmJMfz9/Y169eo5zIslJSXGnDlzjFatWhkuLi6Gt7e38cADDxiZmZkOx//RudMwzs0fU6dONa677jrDzc3NuPXWW4133nmn1Jx+fp59/vnnHY4vb+yK/v8KqAkWw6jgcpwA8DuHDx9WmzZtNHPmTE2fPt3scgAAuOIxdwKlEUgBXNL+/fu1du1ahYWFqWHDhvr66681d+5c5eXl6bPPPit3xUAAAOoq5k6gYvgOKYBLql+/vvbu3avly5fr5MmT8vT0VI8ePfTcc88xoQIAUAbmTqBiuEIKAAAAADAFj30BAAAAAJiCQAoAAAAAMAWBFAAAAABgChY1KsPZs2f1008/ycPDQxaLxexyAAAmMQxDp06dkr+/f4UfZF/XMYcCAKRKzKHmPQL1nMWLFxvXX3+9YbVajVtvvdXYuXNnuX3Xr19v9OnTx/D29jY8PDyMLl26GJs2bXLoc/5BvxduBQUFFa4pMzOzzDHY2NjY2OrmduED71E+5lA2NjY2tt9vl5pDTb1CmpSUpKioKC1ZskRdu3bVyy+/rIiICH3xxRdq3rx5qf47d+5U3759NXv2bF177bVauXKlBg4cqE8++USdOnWy9zv/rKffc3Nzq3BdHh4ekqTMzEw1bNiwimcHALja5eXlKSAgwD4v4NKYQwEAUsXnUFMf+xISEqJbb71VS5cutbe1bdtW99xzj+Li4io0xs0336xhw4bp6aefliQlJiYqKipKJ0+erHJdeXl58vT0VG5uLpMpANRhzAeVx2cGAJAqPh+Y9oWYoqIipaenKzw83KE9PDxcKSkpFRrj7NmzOnXqlBo3buzQnp+fr8DAQDVr1kyRkZHat2/fRccpLCxUXl6ewwYAAAAAuLxMC6THjx9XSUmJfHx8HNp9fHxks9kqNMa8efN0+vRpDR061N7Wpk0bJSYm6r333tPatWvl5uamrl276tChQ+WOExcXJ09PT/sWEBBQtZMCAAAAAFSY6UsGXrgCn2EYFVqVb+3atYqNjVVSUpKaNm1qb+/SpYseeOABdezYUd26ddMbb7yhVq1aKSEhodyxYmJilJuba98yMzOrfkIAAAAAgAoxbVEjb29vOTk5lboamp2
dXeqq6YWSkpI0ZswYvfnmm+rTp89F+9arV0+33XbbRa+QWq1WWa3WihcPAAAAAPjDTLtC6urqquDgYCUnJzu0JycnKywsrNzj1q5dq9GjR2vNmjW66667Lvk+hmEoIyNDfn5+f7hmAAAAAED1MfWxL9HR0Ro5cqQ6d+6s0NBQLVu2TEeOHNG4ceMknbuV9scff9SqVasknQujo0aN0sKFC9WlSxf71VV3d3d5enpKkmbNmqUuXbqoZcuWysvL04svvqiMjAwtXrzYnJMEAAAAAJTJ1EA6bNgwnThxQs8884yysrIUFBSkjRs3KjAwUJKUlZWlI0eO2Pu//PLLOnPmjCZMmKAJEybY2x988EElJiZKkk6ePKlHHnlENptNnp6e6tSpk3bu3Knbb7+9Rs8NAAAAAHBxpj6H9ErFM9QAANLVPR/ExcVp+vTpmjx5suLj4yWd+xrLrFmztGzZMuXk5CgkJESLFy/WzTffbD+usLBQU6dO1dq1a1VQUKDevXtryZIlatasWYXe92r+zAAA1eeKfw4pAAC4PNLS0rRs2TJ16NDBoX3u3LmaP3++Fi1apLS0NPn6+qpv3746deqUvU9UVJQ2bNigdevWaffu3crPz1dkZKRKSkpq+jQAAHUAgRQAgFokPz9ff/7zn/XKK6+oUaNG9nbDMBQfH68ZM2ZoyJAhCgoK0quvvqpff/1Va9askSTl5uZq+fLlmjdvnvr06aNOnTpp9erVOnjwoLZs2VLm+xUWFiovL89hAwCgogikAADUIhMmTNBdd91V6rFohw8fls1mU3h4uL3NarWqe/fuSklJkSSlp6eruLjYoY+/v7+CgoLsfS4UFxcnT09P+xYQEHAZzgoAUFsRSAEAqCXWrVunTz/9VHFxcaX2nV+Z/sJnffv4+Nj32Ww2ubq6OlxZvbDPhWJiYpSbm2vfMjMzq+NUAAB1hKmr7AIAgOqRmZmpyZMna/PmzXJzcyu3n8VicXhtGEaptgtdrI/VapXVaq18wQAAiEAKAFekyZMn69ixY5KkJk2aaOHChSZXhCtdenq6srOzFRwcbG8rKSnRzp07tWjRIn399deSzl0F9fPzs/fJzs62XzX19fVVUVGRcnJyHK6SZmdnKywsrIbOBABQl3DLLgBcgY4dO6aff/5ZP//8sz2YAhfTu3dvHTx4UBkZGfatc+fO+vOf/6yMjAzdcMMN8vX1VXJysv2YoqIi7dixwx42g4OD5eLi4tAnKytLn332GYEUAHBZcIUUAIBawMPDQ0FBQQ5t9evXl5eXl709KipKs2fPVsuWLdWyZUvNnj1b11xzjUaMGCFJ8vT01JgxYzRlyhR5eXmpcePGmjp1qtq3b19qkSQANefIM+3NLgF1RPOnD9b4exJIAQCoI6ZNm6aCggKNHz9eOTk5CgkJ0ebNm+Xh4WHvs2DBAjk7O2vo0KEqKChQ7969lZiYKCcnJxMrBwDUVgRSAABqqe3btzu8tlgsio2NVWxsbLnHuLm5KSEhQQkJCZe3OAAAxHdIAQAAAAAmIZACAAAAAExBIAUAAAAAmIJACgAAAAAwBYEUAAAAAGAKAikAAAAAwBQEUgAAAACAKQikAAAAAABTEEgBAAAAAKYgkAIAAAAATEEgBQAAAACYgkAKAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgAAAAAwBYEUAAAAAGAKAikAAAAAwBQEUgAAAACAKQikAAAAAABTEEgBAAAAAKYgkAIAAAAATEEgBQAAAACYgkAKAEAtsHTpUnXo0EENGzZUw4YNFRoaqg8//NC+f/To0bJYLA5bly5dHMYoLCzUpEmT5O3trfr162vQoEE6evRoTZ8KAKA
OIZACAFALNGvWTP/85z+1d+9e7d27V7169dLdd9+tzz//3N6nf//+ysrKsm8bN250GCMqKkobNmzQunXrtHv3buXn5ysyMlIlJSU1fToAgDrC2ewCAADAHzdw4ECH188995yWLl2qPXv26Oabb5YkWa1W+fr6lnl8bm6uli9frtdee019+vSRJK1evVoBAQHasmWL+vXrd3lPAABQJ3GFFACAWqakpETr1q3T6dOnFRoaam/fvn27mjZtqlatWunhhx9Wdna2fV96erqKi4sVHh5ub/P391dQUJBSUlLKfa/CwkLl5eU5bAAAVBSBFACAWuLgwYNq0KCBrFarxo0bpw0bNqhdu3aSpIiICL3++uvatm2b5s2bp7S0NPXq1UuFhYWSJJvNJldXVzVq1MhhTB8fH9lstnLfMy4uTp6envYtICDg8p0gAKDW4ZZdAABqidatWysjI0MnT57U+vXr9eCDD2rHjh1q166dhg0bZu8XFBSkzp07KzAwUB988IGGDBlS7piGYchisZS7PyYmRtHR0fbXeXl5hFIAQIURSAEAqCVcXV110003SZI6d+6stLQ0LVy4UC+//HKpvn5+fgoMDNShQ4ckSb6+vioqKlJOTo7DVdLs7GyFhYWV+55Wq1VWq7WazwQAUFdwyy4AALWUYRj2W3IvdOLECWVmZsrPz0+SFBwcLBcXFyUnJ9v7ZGVl6bPPPrtoIAUA4I/gCikAALXA9OnTFRERoYCAAJ06dUrr1q3T9u3btWnTJuXn5ys2Nlb33nuv/Pz89P3332v69Ony9vbW4MGDJUmenp4aM2aMpkyZIi8vLzVu3FhTp05V+/bt7avuAgBQ3QikAADUAj///LNGjhyprKwseXp6qkOHDtq0aZP69u2rgoICHTx4UKtWrdLJkyfl5+ennj17KikpSR4eHvYxFixYIGdnZw0dOlQFBQXq3bu3EhMT5eTkZOKZAQBqMwIpAAC1wPLly8vd5+7uro8++uiSY7i5uSkhIUEJCQnVWRoAAOXiO6QAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUzibXcCSJUv0/PPPKysrSzfffLPi4+PVrVu3Mvu+/fbbWrp0qTIyMlRYWKibb75ZsbGx6tevn0O/9evX66mnntL//vc/3XjjjXruuec0ePDgmjgdAOU48kx7s0u4qpw56SXJ6f//+ic+v0po/vRBs0sAAAAVZOoV0qSkJEVFRWnGjBnat2+funXrpoiICB05cqTM/jt37lTfvn21ceNGpaenq2fPnho4cKD27dtn75Oamqphw4Zp5MiR2r9/v0aOHKmhQ4fqk08+qanTAgAAAABUgMUwDMOsNw8JCdGtt96qpUuX2tvatm2re+65R3FxcRUa4+abb9awYcP09NNPS5KGDRumvLw8ffjhh/Y+/fv3V6NGjbR27doKjZmXlydPT0/l5uaqYcOGlTgjAOXhCl/lTN3jpROF566QellL9EKXEyZXdPWoziukzAeVx2cGVD/mUNQUM+ZQ066QFhUVKT09XeHh4Q7t4eHhSklJqdAYZ8+e1alTp9S4cWN7W2pqaqkx+/Xrd9ExCwsLlZeX57ABAAAAAC4v0wLp8ePHVVJSIh8fH4d2Hx8f2Wy2Co0xb948nT59WkOHDrW32Wy2So8ZFxcnT09P+xYQEFCJMwEAAAAAVIXpq+xaLBaH14ZhlGory9q1axUbG6ukpCQ1bdr0D40ZExOj3Nxc+5aZmVmJMwAAAAAAVIVpq+x6e3vLycmp1JXL7OzsUlc4L5SUlKQxY8bozTffVJ8+fRz2+fr6VnpMq9Uqq9VayTMAAAAAAPwRpl0hdXV1VXBwsJKTkx3ak5OTFRYWVu5xa9eu1ejRo7VmzRrdddddpfaHhoaWGnPz5s0XHRMAAAAAUPNMfQ5pdHS0Ro4cqc6dOys0NFTLli3
TkSNHNG7cOEnnbqX98ccftWrVKknnwuioUaO0cOFCdenSxX4l1N3dXZ6enpKkyZMn684779ScOXN09913691339WWLVu0e/duc04SAAAAAFAmU79DOmzYMMXHx+uZZ57RLbfcop07d2rjxo0KDAyUJGVlZTk8k/Tll1/WmTNnNGHCBPn5+dm3yZMn2/uEhYVp3bp1WrlypTp06KDExEQlJSUpJCSkxs8PAAAAAFA+U6+QStL48eM1fvz4MvclJiY6vN6+fXuFxrzvvvt03333/cHKAAAAAACXk+mr7AIAAAAA6iYCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgBALbB06VJ16NBBDRs2VMOGDRUaGqoPP/zQvt8wDMXGxsrf31/u7u7q0aOHPv/8c4cxCgsLNWnSJHl7e6t+/foaNGiQjh49WtOnAgCoQwikAADUAs2aNdM///lP7d27V3v37lWvXr10991320Pn3LlzNX/+fC1atEhpaWny9fVV3759derUKfsYUVFR2rBhg9atW6fdu3crPz9fkZGRKikpMeu0AAC1HIEUAIBaYODAgRowYIBatWqlVq1a6bnnnlODBg20Z88eGYah+Ph4zZgxQ0OGDFFQUJBeffVV/frrr1qzZo0kKTc3V8uXL9e8efPUp08fderUSatXr9bBgwe1ZcsWk88OAFBbEUgBAKhlSkpKtG7dOp0+fVqhoaE6fPiwbDabwsPD7X2sVqu6d++ulJQUSVJ6erqKi4sd+vj7+ysoKMjepyyFhYXKy8tz2AAAqCgCKQAAtcTBgwfVoEEDWa1WjRs3Ths2bFC7du1ks9kkST4+Pg79fXx87PtsNptcXV3VqFGjcvuUJS4uTp6envYtICCgms8KAFCbEUgBAKglWrdurYyMDO3Zs0ePPfaYHnzwQX3xxRf2/RaLxaG/YRil2i50qT4xMTHKzc21b5mZmX/sJAAAdQqBFACAWsLV1VU33XSTOnfurLi4OHXs2FELFy6Ur6+vJJW60pmdnW2/aurr66uioiLl5OSU26csVqvVvrLv+Q0AgIoikAIAUEsZhqHCwkK1aNFCvr6+Sk5Otu8rKirSjh07FBYWJkkKDg6Wi4uLQ5+srCx99tln9j4AAFQ3Z7MLAAAAf9z06dMVERGhgIAAnTp1SuvWrdP27du1adMmWSwWRUVFafbs2WrZsqVatmyp2bNn65prrtGIESMkSZ6enhozZoymTJkiLy8vNW7cWFOnTlX79u3Vp08fk88OAFBbEUgBAKgFfv75Z40cOVJZWVny9PRUhw4dtGnTJvXt21eSNG3aNBUUFGj8+PHKyclRSEiINm/eLA8PD/sYCxYskLOzs4YOHaqCggL17t1biYmJcnJyMuu0AAC1HIEUAIBaYPny5Rfdb7FYFBsbq9jY2HL7uLm5KSEhQQkJCdVcHQAAZeM7pAAAAAAAUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgAAAAAwBYEUAAAAAGAKAikAAAAAwBQEUgAAAACAKQikAAAAAABTEEgBAAAAAKYgkAIAAAAATEEgBQAAAACYgkAKAAAAADAFgRQAAAAAYAoCKQAAAADAFM5mFwBcaPLkyTp27JgkqUmTJlq4cKHJFQEAAAC4HAikuOIcO3ZMP//8s9llAAAAALjMuGUXAAAAAGAKAikAAAAAwBQEUgAAAACAKQikAAAAAABTEEgBAAAAAKYgkAIAAAAATEEgBQAAAACYgkAKAAAAADAFgRQAAAAAYApnswsAAJTW2FpS5q8BAABqEwIpAFyBpnc6aXYJAAAAlx237AIAAAAATEEgBQAAAACYgkAKAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIA
pCKQAAAAAAFMQSAEAAAAApiCQAgBQC8TFxem2226Th4eHmjZtqnvuuUdff/21Q5/Ro0fLYrE4bF26dHHoU1hYqEmTJsnb21v169fXoEGDdPTo0Zo8FQBAHUIgBQCgFtixY4cmTJigPXv2KDk5WWfOnFF4eLhOnz7t0K9///7Kysqybxs3bnTYHxUVpQ0bNmjdunXavXu38vPzFRkZqZKSkpo8HQBAHeFsdgEAAOCP27Rpk8PrlStXqmnTpkpPT9edd95pb7darfL19S1zjNzcXC1fvlyvvfaa+vTpI0lavXq1AgICtGXLFvXr1+/ynQAAoE4y/QrpkiVL1KJFC7m5uSk4OFi7du0qt29WVpZGjBih1q1bq169eoqKiirVJzExsdTtSBaLRb/99ttlPAsAAK4subm5kqTGjRs7tG/fvl1NmzZVq1at9PDDDys7O9u+Lz09XcXFxQoPD7e3+fv7KygoSCkpKWW+T2FhofLy8hw2AAAqytRAmpSUpKioKM2YMUP79u1Tt27dFBERoSNHjpTZv7CwUE2aNNGMGTPUsWPHcsdt2LChw+1IWVlZcnNzu1ynAQDAFcUwDEVHR+uOO+5QUFCQvT0iIkKvv/66tm3bpnnz5iktLU29evVSYWGhJMlms8nV1VWNGjVyGM/Hx0c2m63M94qLi5Onp6d9CwgIuHwnBgCodUy9ZXf+/PkaM2aMxo4dK0mKj4/XRx99pKVLlyouLq5U/+uvv14LFy6UJK1YsaLccS0WS7m3IwEAUNtNnDhRBw4c0O7dux3ahw0bZv91UFCQOnfurMDAQH3wwQcaMmRIueMZhiGLxVLmvpiYGEVHR9tf5+XlEUoBABVm2hXSoqIipaenO9wWJEnh4eHl3hZUUfn5+QoMDFSzZs0UGRmpffv2XbQ/txsBAGqLSZMm6b333tPHH3+sZs2aXbSvn5+fAgMDdejQIUmSr6+vioqKlJOT49AvOztbPj4+ZY5htVrVsGFDhw0AgIoyLZAeP35cJSUlpSa4i90WVBFt2rRRYmKi3nvvPa1du1Zubm7q2rWrfbItC7cbAQCudoZhaOLEiXr77be1bds2tWjR4pLHnDhxQpmZmfLz85MkBQcHy8XFRcnJyfY+WVlZ+uyzzxQWFnbZagcA1F2mL2p04S1AF7stqCK6dOmiBx54QB07dlS3bt30xhtvqFWrVkpISCj3mJiYGOXm5tq3zMzMKr8/AABmmDBhglavXq01a9bIw8NDNptNNptNBQUFks7dPTR16lSlpqbq+++/1/bt2zVw4EB5e3tr8ODBkiRPT0+NGTNGU6ZM0datW7Vv3z498MADat++vX3VXQAAqpNp3yH19vaWk5NTqauhF7stqCrq1aun22677aJXSK1Wq6xWa7W9JwAANW3p0qWSpB49eji0r1y5UqNHj5aTk5MOHjyoVatW6eTJk/Lz81PPnj2VlJQkDw8Pe/8FCxbI2dlZQ4cOVUFBgXr37q3ExEQ5OTnV5OkAAOoI0wKpq6urgoODlZycbP/JrCQlJyfr7rvvrrb3MQxDGRkZat++fbWNCQDAlcYwjIvud3d310cffXTJcdzc3JSQkHDRO4sAAKgupq6yGx0drZEjR6pz584KDQ3VsmXLdOTIEY0bN07SuVtpf/zxR61atcp+TEZGhqRztx4dO3ZMGRkZcnV1Vbt27SRJs2bNUpcuXdSyZUvl5eXpxRdfVEZGhhYvXlzj5wcAAAAAKJ+pgXTYsGE6ceKEnnnmGWVlZSkoKEgbN25UYGCgpHMLKVz4TNJOnTrZf52enq41a9YoMDBQ33//vSTp5MmTeuSRR2Sz2eTp6alOnTpp586duv3222vsvAAAAAAAl2ZqIJWk8ePHa/z48WXuS0xMLNV2qVuSFixYoAULFlRHaQAAAACAy8j0VXYBAAAAAHUTgRQAAAAAYAoCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAA
AAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAmKhXr146efJkqfa8vDz16tWr5gsCAKAGEUgBADDR9u3bVVRUVKr9t99+065du0yoCACAmuNclYM2bdqkBg0a6I477pAkLV68WK+88oratWunxYsXq1GjRtVaJAAAtc2BAwfsv/7iiy9ks9nsr0tKSrRp0yZdd911ZpQGAECNqVIg/etf/6o5c+ZIkg4ePKgpU6YoOjpa27ZtU3R0tFauXFmtRQIAUNvccsstslgsslgsZd6a6+7uroSEBBMqAwCg5lQpkB4+fFjt2rWTJK1fv16RkZGaPXu2Pv30Uw0YMKBaCwQAoDY6fPiwDMPQDTfcoP/+979q0qSJfZ+rq6uaNm0qJycnEyu8cgT/dZXZJaCOSH9+lNklAHVOlQKpq6urfv31V0nSli1bNGrUub+8jRs3Vl5eXvVVBwBALRUYGChJOnv2rMmVAABgnioF0jvuuEPR0dHq2rWr/vvf/yopKUmS9M0336hZs2bVWiAAALXdN998o+3btys7O7tUQH366adNqgoAgMuvSoF00aJFGj9+vN566y0tXbrUvujChx9+qP79+1drgQAA1GavvPKKHnvsMXl7e8vX11cWi8W+z2KxEEgBALValQJp8+bN9f7775dqX7BgwR8uCACAuuTZZ5/Vc889pyeffNLsUgAAqHFVeg7pp59+qoMHD9pfv/vuu7rnnns0ffr0Mp+lBgAAypaTk6M//elPZpcBAIApqhRIH330UX3zzTeSpO+++07Dhw/XNddcozfffFPTpk2r1gIBAKjN/vSnP2nz5s1mlwEAgCmqdMvuN998o1tuuUWS9Oabb+rOO+/UmjVr9J///EfDhw9XfHx8NZYIAEDtddNNN+mpp57Snj171L59e7m4uDjsf/zxx02qDACAy69KgdQwDPsqgFu2bFFkZKQkKSAgQMePH6++6gAAqOWWLVumBg0aaMeOHdqxY4fDPovFQiAFANRqVQqknTt31rPPPqs+ffpox44dWrp0qaRzD/n28fGp1gIBAKjNDh8+bHYJAACYpkrfIY2Pj9enn36qiRMnasaMGbrpppskSW+99ZbCwsKqtUAAAAAAQO1UpSukHTp0cFhl97znn39eTk5Of7goAADqir/85S8X3b9ixYoaqgQAgJpXpUBaHjc3t+ocDgCAWi8nJ8fhdXFxsT777DOdPHlSvXr1MqkqAABqRpUCaUlJiRYsWKA33nhDR44cKfXs0V9++aVaiqtNgv+6yuwSrhoNc/Lt95Jn5eTz2VVS+vOjzC4BQCVs2LChVNvZs2c1fvx43XDDDSZUBABAzanSd0hnzZql+fPna+jQocrNzVV0dLSGDBmievXqKTY2tppLBACgbqlXr56eeOIJLViwwOxSAAC4rKoUSF9//XW98sormjp1qpydnXX//ffrX//6l55++mnt2bOnumsEAKDO+d///qczZ86YXQYAAJdVlQKpzWZT+/btJUkNGjRQbm6uJCkyMlIffPBB9VUHAEAtFx0d7bA98cQTGj58uIYNG6Zhw4ZVeJy4uDjddttt8vDwUNOmTXXPPffo66+/duhjGIZiY2Pl7+8vd3d39ejRQ59//rlDn8LCQk2aNEne3t6qX7++Bg0apKNHj1bLuQIAcKEqBdJmzZopKytLknTTTTdp8+bNkqS0tDRZrdbqqw4AgFpu3759DtuBAwckSfPmzVN8fHyFx9mxY4cmTJigPXv2KDk5WWfOnFF4eLhOnz5t7zN37lzNnz9fixYtUlpamnx9fdW3b1+dOnXK3icqKkobNmzQunXrtHv3buXn5ysyMlIlJSXVds4AAJxXpUWNBg8erK1btyokJESTJ0/W/fffr+XLl+vIkSN64oknqrtGAABqrY8//rhaxtm0aZPD65UrV6pp06ZKT0/XnXfeKcMwFB8frxkzZmjIkCGSpFdffVU+Pj5as2aNHn30UeXm5mr
58uV67bXX1KdPH0nS6tWrFRAQoC1btqhfv37VUisAAOdVKZD+85//tP/6vvvuU7NmzZSSkqKbbrpJgwYNqrbiAACoK44dO6avv/5aFotFrVq1UpMmTf7QeOe/TtO4cWNJ0uHDh2Wz2RQeHm7vY7Va1b17d6WkpOjRRx9Venq6iouLHfr4+/srKChIKSkpZQbSwsJCFRYW2l/n5eX9oboBAHVLtTyHtEuXLurSpUt1DAUAQJ1y+vRpTZo0SatWrdLZs2clSU5OTho1apQSEhJ0zTXXVHpMwzAUHR2tO+64Q0FBQZLOrf8gST4+Pg59fXx89MMPP9j7uLq6qlGjRqX6nD/+QnFxcZo1a1alawQAQKpEIH3vvfcqPChXSQEAqJjo6Gjt2LFD//73v9W1a1dJ0u7du/X4449rypQpWrp0aaXHnDhxog4cOKDdu3eX2mexWBxeG4ZRqu1CF+sTExOj6Oho++u8vDwFBARUumYAQN1U4UB6zz33VKifxWJh4QMAACpo/fr1euutt9SjRw9724ABA+Tu7q6hQ4dWOpBOmjRJ7733nnbu3KlmzZrZ2319fSWduwrq5+dnb8/OzrZfNfX19VVRUZFycnIcrpJmZ2crLCyszPezWq0saAgAqLIKr7J79uzZCm2EUQAAKu7XX38tdRutJDVt2lS//vprhccxDEMTJ07U22+/rW3btqlFixYO+1u0aCFfX18lJyfb24qKirRjxw572AwODpaLi4tDn6ysLH322WflBlIAAP6ISj32Zdu2bWrXrl2ZCxbk5ubq5ptv1q5du6qtOAAAarvQ0FDNnDlTv/32m72toKBAs2bNUmhoaIXHmTBhglavXq01a9bIw8NDNptNNptNBQUFks7dwRQVFaXZs2drw4YN+uyzzzR69Ghdc801GjFihCTJ09NTY8aM0ZQpU7R161bt27dPDzzwgNq3b29fdRcAgOpUqUWN4uPj9fDDD6thw4al9nl6eurRRx/V/Pnz1a1bt2orEACA2iw+Pl4RERFq1qyZOnbsKIvFooyMDFmtVvtzvivi/K29v7/1Vzr3+JfRo0dLkqZNm6aCggKNHz9eOTk5CgkJ0ebNm+Xh4WHvv2DBAjk7O2vo0KEqKChQ7969lZiYKCcnpz98rgAAXKhSgXT//v2aM2dOufvDw8P1wgsv/OGiAACoK9q3b69Dhw5p9erV+uqrr2QYhoYPH64///nPcnd3r/A4hmFcso/FYlFsbKxiY2PL7ePm5qaEhAQlJCRU+L0BAKiqSgXSn3/+WS4uLuUP5uysY8eO/eGiAACoK+Li4uTj46OHH37YoX3FihU6duyYnnzySZMqAwDg8qvUd0ivu+46HTx4sNz9Bw4ccFi5DwAAXNzLL7+sNm3alGq/+eab9dJLL5lQEQAANadSgXTAgAF6+umnHRZeOK+goEAzZ85UZGRktRUHAEBtd+FjWM5r0qSJsrKyTKgIAICaU6lbdv/+97/r7bffVqtWrTRx4kS1bt1aFotFX375pRYvXqySkhLNmDHjctUKAECtExAQoP/85z+lHtPyn//8R/7+/iZVBQBAzahUIPXx8VFKSooee+wxxcTE2BdQsFgs6tevn5YsWVLms9QAAEDZxo4dq6ioKBUXF6tXr16SpK1bt2ratGmaMmWKydUBAHB5VSqQSlJgYKA2btyonJwcffvttzIMQy1btlSjRo0uR30AANRq06ZN0y+//KLx48erqKhI0rmVbp988knFxMSYXB0AAJdXpQPpeY0aNdJtt91WnbUAAFDnWCwWzZkzR0899ZS+/PJLubu7q2XLlrJarWaXBgDAZVflQAoAAKpPgwYN+EEvAKDOqdQquwAAAAAAVBcCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAA
AAExheiBdsmSJWrRoITc3NwUHB2vXrl3l9s3KytKIESPUunVr1atXT1FRUWX2W79+vdq1ayer1ap27dppw4YNl6l6AAAAAEBVmRpIk5KSFBUVpRkzZmjfvn3q1q2bIiIidOTIkTL7FxYWqkmTJpoxY4Y6duxYZp/U1FQNGzZMI0eO1P79+zVy5EgNHTpUn3zyyeU8FQAAAABAJZkaSOfPn68xY8Zo7Nixatu2reLj4xUQEKClS5eW2f/666/XwoULNWrUKHl6epbZJz4+Xn379lVMTIzatGmjmJgY9e7dW/Hx8ZfxTAAAAAAAlWVaIC0qKlJ6errCw8Md2sPDw5WSklLlcVNTU0uN2a9fv4uOWVhYqLy8PIcNAAAAAHB5mRZIjx8/rpKSEvn4+Di0+/j4yGazVXlcm81W6THj4uLk6elp3wICAqr8/gAAAACAijF9USOLxeLw2jCMUm2Xe8yYmBjl5ubat8zMzD/0/gAAAACAS3M26429vb3l5ORU6spldnZ2qSucleHr61vpMa1Wq6xWa5XfEwAAAABQeaZdIXV1dVVwcLCSk5Md2pOTkxUWFlblcUNDQ0uNuXnz5j80JgAAAACg+pl2hVSSoqOjNXLkSHXu3FmhoaFatmyZjhw5onHjxkk6dyvtjz/+qFWrVtmPycjIkCTl5+fr2LFjysjIkKurq9q1aydJmjx5su68807NmTNHd999t959911t2bJFu3fvrvHzAwAAAACUz9RAOmzYMJ04cULPPPOMsrKyFBQUpI0bNyowMFCSlJWVVeqZpJ06dbL/Oj09XWvWrFFgYKC+//57SVJYWJjWrVunv//973rqqad04403KikpSSEhITV2XgAAAACASzM1kErS+PHjNX78+DL3JSYmlmozDOOSY95333267777/mhpAAAAAIDLyPRVdgEAAAAAdROBFAAAAABgCgIpAAAAAMAUBFIAAGqBnTt3auDAgfL395fFYtE777zjsH/06NGyWCwOW5cuXRz6FBYWatKkSfL29lb9+vU1aNAgHT16tAbPAgBQ1xBIAQCoBU6fPq2OHTtq0aJF5fbp37+/srKy7NvGjRsd9kdFRWnDhg1at26ddu/erfz8fEVGRqqkpORylw8AqKNMX2UXAAD8cREREYqIiLhoH6vVKl9f3zL35ebmavny5XrttdfUp08fSdLq1asVEBCgLVu2qF+/fmUeV1hYqMLCQvvrvLy8Kp4BAKAu4gopAAB1xPbt29W0aVO1atVKDz/8sLKzs+370tPTVVxcrPDwcHubv7+/goKClJKSUu6YcXFx8vT0tG8BAQGX9RwAALULgRQAgDogIiJCr7/+urZt26Z58+YpLS1NvXr1sl/dtNlscnV1VaNGjRyO8/Hxkc1mK3fcmJgY5ebm2rfMzMzLeh4AgNqFW3YBAKgDhg0bZv91UFCQOnfurMDAQH3wwQcaMmRIuccZhiGLxVLufqvVKqvVWq21AgDqDq6QAgBQB/n5+SkwMFCHDh2SJPn6+qqoqEg5OTkO/bKzs+Xj42NGiQCAOoBACgBAHXTixAllZmbKz89PkhQcHCwXFxclJyfb+2RlZemzzz5TWFiYWWUCAGo5btkFAKAWyM/P17fffmt/ffjwYWVkZKhx48Zq3LixYmNjde+998rPz0/ff/+9pk+fLm9vbw0ePFiS5OnpqTFjxmjKlCny8vJS48aNNXXqVLVv396+6i4AANWNQAoAQC2wd+9e9ezZ0/46OjpakvTggw9q6dKlOnjwoFatWqWTJ0/Kz89PPXv2VFJSkjw8POzHLFiwQM7Ozho6dKgKCgrUu3dvJSYmysnJqcbPBwBQNxBIAQCoBXr06CHDMMrd/9FHH11yDDc3NyUkJCghIaE6SwMAoFx8hxQAAAAAYAoCKQAAAADAFNyyiyvOWZf6Zf4aAAAAQO1CIMUVJ791hNklAAAAAKgB3LILAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIA
pCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgAAAAAwBYEUAAAAAGAKAikAAAAAwBQEUgAAAACAKQikAAAAAABTEEgBAAAAAKYgkAIAAAAATEEgBQAAAACYgkAKAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAKAW2LlzpwYOHCh/f39ZLBa98847DvsNw1BsbKz8/f3l7u6uHj166PPPP3foU1hYqEmTJsnb21v169fXoEGDdPTo0Ro8CwBAXUMgBQCgFjh9+rQ6duyoRYsWlbl/7ty5mj9/vhYtWqS0tDT5+vqqb9++OnXqlL1PVFSUNmzYoHXr1mn37t3Kz89XZGSkSkpKauo0AAB1jLPZBQAAgD8uIiJCERERZe4zDEPx8fGaMWOGhgwZIkl69dVX5ePjozVr1ujRRx9Vbm6uli9frtdee019+vSRJK1evVoBAQHasmWL+vXrV2PnAgCoO7hCCgBALXf48GHZbDaFh4fb26xWq7p3766UlBRJUnp6uoqLix36+Pv7KygoyN6nLIWFhcrLy3PYAACoKAIpAAC1nM1mkyT5+Pg4tPv4+Nj32Ww2ubq6qlGjRuX2KUtcXJw8PT3tW0BAQDVXDwCozQikAADUERaLxeG1YRil2i50qT4xMTHKzc21b5mZmdVSKwCgbiCQAgBQy/n6+kpSqSud2dnZ9qumvr6+KioqUk5OTrl9ymK1WtWwYUOHDQCAiiKQAgBQy7Vo0UK+vr5KTk62txUVFWnHjh0KCwuTJAUHB8vFxcWhT1ZWlj777DN7HwAAqhur7AIAUAvk5+fr22+/tb8+fPiwMjIy1LhxYzVv3lxRUVGaPXu2WrZsqZYtW2r27Nm65pprNGLECEmSp6enxowZoylTpsjLy0uNGzfW1KlT1b59e/uquwAAVDcCKQAAtcDevXvVs2dP++vo6GhJ0oMPPqjExERNmzZNBQUFGj9+vHJychQSEqLNmzfLw8PDfsyCBQvk7OysoUOHqqCgQL1791ZiYqKcnJxq/HwAAHUDgRQAgFqgR48eMgyj3P0Wi0WxsbGKjY0tt4+bm5sSEhKUkJBwGSoEAKA0vkMKAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApjA9kC5ZskQtWrSQm5ubgoODtWvXrov237Fjh4KDg+Xm5qYbbrhBL730ksP+xMREWSyWUttvv/12OU8DAAAAAFBJpgbSpKQkRUVFacaMGdq3b5+6deumiIgIHTlypMz+hw8f1oABA9StWzft27dP06dP1+OPP67169c79GvYsKGysrIcNjc3t5o4JQAAAABABTmb+ebz58/XmDFjNHbsWElSfHy8PvroIy1dulRxcXGl+r/00ktq3ry54uPjJUlt27bV3r179cILL+jee++197NYLPL19a1wHYWFhSosLLS/zsvLq+IZAQAAAAAqyrQrpEVFRUpPT1d4eLhDe3h4uFJSUso8JjU1tVT/fv36ae/evSouLra35efnKzAwUM2aNVNkZKT27dt30Vri4uLk6elp3wICAqp4VgAAAACAijItkB4/flwlJSXy8fFxaPfx8ZHNZivzGJvNVmb/M2fO6Pjx45KkNm3aKDExUe+9957Wrl0rNzc3de3aVYcOHSq3lpiYGOXm5tq3zMzMP3h2AAAAAIBLMfWWXenc7bW/ZxhGqbZL9f99e5cuXdSlSxf7/q5du+rWW29VQkKCXnzxxTLHtFqtslqtVaofAAAAAFA1pl0h9fb2lpOTU6mrodnZ2aWugp7n6+tbZn9nZ2d5eXmVeUy9evV02223XfQKKQAAAACg5pkWSF1dXRUcHKzk5GSH9uT
kZIWFhZV5TGhoaKn+mzdvVufOneXi4lLmMYZhKCMjQ35+ftVTOAAAAACgWpj62Jfo6Gj961//0ooVK/Tll1/qiSee0JEjRzRu3DhJ577bOWrUKHv/cePG6YcfflB0dLS+/PJLrVixQsuXL9fUqVPtfWbNmqWPPvpI3333nTIyMjRmzBhlZGTYxwQAAAAAXBlM/Q7psGHDdOLECT3zzDPKyspSUFCQNm7cqMDAQElSVlaWwzNJW7RooY0bN+qJJ57Q4sWL5e/vrxdffNHhkS8nT57UI488IpvNJk9PT3Xq1Ek7d+7U7bffXuPnBwAAAAAon+mLGo0fP17jx48vc19iYmKptu7du+vTTz8td7wFCxZowYIF1VUeAAAAAOAyMfWWXQAAAABA3UUgBQAAAACYgkAKAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgBAHREbGyuLxeKw+fr62vcbhqHY2Fj5+/vL3d1dPXr00Oeff25ixQCA2o5ACgBAHXLzzTcrKyvLvh08eNC+b+7cuZo/f74WLVqktLQ0+fr6qm/fvjp16pSJFQMAajMCKQAAdYizs7N8fX3tW5MmTSSduzoaHx+vGTNmaMiQIQoKCtKrr76qX3/9VWvWrDG5agBAbUUgBQCgDjl06JD8/f3VokULDR8+XN99950k6fDhw7LZbAoPD7f3tVqt6t69u1JSUsodr7CwUHl5eQ4bAAAVRSAFAKCOCAkJ0apVq/TRRx/plVdekc1mU1hYmE6cOCGbzSZJ8vHxcTjGx8fHvq8scXFx8vT0tG8BAQGX9RwAALULgRQAgDoiIiJC9957r9q3b68+ffrogw8+kCS9+uqr9j4Wi8XhGMMwSrX9XkxMjHJzc+1bZmbm5SkeAFArEUgBAKij6tevr/bt2+vQoUP21XYvvBqanZ1d6qrp71mtVjVs2NBhAwCgogikAADUUYWFhfryyy/l5+enFi1ayNfXV8nJyfb9RUVF2rFjh8LCwkysEgBQmzmbXQAAAKgZU6dO1cCBA9W8eXNlZ2fr2WefVV5enh588EFZLBZFRUVp9uzZatmypVq2bKnZs2frmmuu0YgRI8wuHQBQSxFIAQCoI44ePar7779fx48fV5MmTdSlSxft2bNHgYGBkqRp06apoKBA48ePV05OjkJCQrR582Z5eHiYXDkAoLYikAIAUEesW7fuovstFotiY2MVGxtbMwUBAOo8vkMKAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgAAAAAwBYEUAAAAAGAKAikAAAAAwBQEUgAAAACAKQikAAAAAABTEEgBAAAAAKYgkAIAAAAATEEgBQAAAACYgkAKAAAAADAFgRQAAAAAYAoCKQAAAADAFARSAAAAAIApCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSAFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgClMD6RLlixRixYt5ObmpuDgYO3ateui/Xfs2KHg4GC5ubnphhtu0EsvvVSqz/r169WuXTtZrVa1a9dOGzZsuFzlAwBQK1V2fgYAoCpMDaRJSUmKiorSjBkztG/fPnXr1k0RERE6cuRImf0PHz6sAQMGqFu3btq3b5+mT5+uxx9/XOvXr7f3SU1N1bBhwzRy5Ejt379fI0eO1NChQ/XJJ5/U1GkBAHBVq+z8DABAVZkaSOfPn68xY8Zo7Ni
xatu2reLj4xUQEKClS5eW2f+ll15S8+bNFR8fr7Zt22rs2LH6y1/+ohdeeMHeJz4+Xn379lVMTIzatGmjmJgY9e7dW/Hx8TV0VgAAXN0qOz8DAFBVzma9cVFRkdLT0/W3v/3NoT08PFwpKSllHpOamqrw8HCHtn79+mn58uUqLi6Wi4uLUlNT9cQTT5Tqc7FAWlhYqMLCQvvr3NxcSVJeXl5lTumiSgoLqm0s4GKq889tdTr1W4nZJaCOqM6/A+fHMgyj2sa80lV2fmYORW3CHIq6zow51LRAevz4cZWUlMjHx8eh3cfHRzabrcxjbDZbmf3PnDmj48ePy8/Pr9w+5Y0pSXFxcZo1a1ap9oCAgIqeDnDF8EwYZ3YJgLniPKt9yFOnTsnTs/rHvRJVdn5mDkVtwhyKOs+EOdS0QHqexWJxeG0YRqm2S/W/sL2yY8bExCg6Otr++uzZs/rll1/k5eV10eNw+eTl5SkgIECZmZlq2LCh2eUApuDvgfkMw9CpU6fk7+9vdik1rqJzKXPolYd/OwD+HlwJKjqHmhZIvb295eTkVOqnrdnZ2aV+Knuer69vmf2dnZ3l5eV10T7ljSlJVqtVVqvVoe3aa6+t6KngMmrYsCH/iKDO4++BuerKldHzKjs/M4deufi3A+DvgdkqMoeatqiRq6urgoODlZyc7NCenJyssLCwMo8JDQ0t1X/z5s3q3LmzXFxcLtqnvDEBAMD/qcr8DABAVZl6y250dLRGjhypzp07KzQ0VMuWLdORI0c0bty5+/djYmL0448/atWqVZKkcePGadGiRYqOjtbDDz+s1NRULV++XGvXrrWPOXnyZN15552aM2eO7r77br377rvasmWLdu/ebco5AgBwtbnU/AwAQHUxNZAOGzZMJ06c0DPPPKOsrCwFBQVp48aNCgwMlCRlZWU5PPOsRYsW2rhxo5544gktXrxY/v7+evHFF3Xvvffa+4SFhWndunX6+9//rqeeeko33nijkpKSFBISUuPnh6qzWq2aOXNmqdvAgLqEvwcwy6XmZ1zZ+LcD4O/B1cRi1KW17AEAAAAAVwzTvkMKAAAAAKjbCKQAAAAAAFMQSAEAAAAApiCQAgAAAABMQSDFFSklJUVOTk7q37+/2aUANWr06NGyWCz2zcvLS/3799eBAwfMLg3AVYI5FHUVc+jViUCKK9KKFSs0adIk7d692+HRP0Bd0L9/f2VlZSkrK0tbt26Vs7OzIiMjzS4LwFWCORR1GXPo1YdAiivO6dOn9cYbb+ixxx5TZGSkEhMTzS4JqFFWq1W+vr7y9fXVLbfcoieffFKZmZk6duyY2aUBuMIxh6KuYw69+hBIccVJSkpS69at1bp1az3wwANauXKleFwu6qr8/Hy9/vrruummm+Tl5WV2OQCucMyhwP9hDr06OJtdAHCh5cuX64EHHpB07raL/Px8bd26VX369DG5MqBmvP/++2rQoIGkc1c7/Pz89P7776tePX6GCODimENR1zGHXn34ncEV5euvv9Z///tfDR8+XJLk7OysYcOGacWKFSZXBtScnj17KiMjQxkZGfrkk08UHh6uiIgI/fDDD2aXBuAKxhwKMIdejbhCiivK8uXLdebMGV133XX2NsMw5OLiopycHDVq1MjE6oCaUb9+fd10003218HBwfL09NQrr7yiZ5991sTKAFzJmEMB5tCrEVdIccU4c+aMVq1apXnz5tl/spWRkaH9+/crMDBQr7/+utklAqawWCyqV6+eCgoKzC4FwBWKORQoG3PolY8rpLhivP/++8rJydGYMWPk6enpsO++++7T8uXLNXHiRJOqA2pOYWGhbDabJCknJ0eLFi1Sfn6+Bg4caHJlAK5UzKHAOcyhVx+ukOKKsXz5cvXp06fURCpJ9957rzIyMvTpp5+aUBlQszZt2iQ/Pz/5+fkpJCREaWlpevPNN9WjRw+zSwNwhWIOBc5hDr36WAzWAgcAAAAAmIArpAAAAAA
AUxBIAQAAAACmIJACAAAAAExBIAUAAAAAmIJACgAAAAAwBYEUAAAAAGAKAikAAAAAwBQEUgAAAACAKQikAAAAAABTEEgBSJJGjx4ti8VSauvfv78k6frrr1d8fLy5RQIAcAViDgWqztnsAgBcOfr376+VK1c6tFmtVpOqAQDg6sEcClQNgRSAndVqla+vr9llAABw1WEOBaqGW3YBAAAAAKYgkAKwe//999WgQQOH7R//+IfZZQEAcMVjDgWqhlt2Adj17NlTS5cudWhr3LixSdUAAHD1YA4FqoZACsCufv36uummm8wuAwCAqw5zKFA13LILAAAAADAFV0gB2BUWFspmszm0OTs7y9vb26SKAAC4OjCHAlVDIAVgt2nTJvn5+Tm0tW7dWl999ZXOnj0rZ2f+yQAAoCzMoUDVcMsuAElSYmKiDMMotX311VcqKSnRiRMneL4aAABlYA4Fqo4f1QC4qKNHj2rVqlUqKSnRHXfcYXY5AABcNZhDgUsjkAK4qFtuuUVeXl567bXX+OkuAACVwBwKXJrFMAzD7CIAAAAAAHUP3yEFAAAAAJiCQAoAAAAAMAWBFAAAAABgCgIpAAAAAMAUBFIAAAAAgCkIpAAAAAAAUxBIAQAAAACmIJACAAAAAEzx/wB2ErnHWEyyJgAAAABJRU5ErkJggg==", + "text/plain": [ + "
# Two side-by-side panels for the categorical feature "EJ":
# left  - bar plot of the dependent variable per EJ level,
# right - number of rows per EJ level.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(11, 5))
rate_ax, count_ax = axs

rate_plot = sns.barplot(data=train_df, x="EJ", y=dependent, ax=rate_ax)
rate_plot.set(title="Positive Diagnosis Rate")

count_plot = sns.countplot(data=train_df, x="EJ", ax=count_ax)
count_plot.set(title="Histogram")
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(11, 5))\n", + "sns.boxenplot(data=train_df, x=dependent, y=\"AB\", ax=ax1)\n", + "sns.kdeplot(data=train_df, x=\"AB\", ax=ax2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8e026db2", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:16.727198Z", + "iopub.status.busy": "2023-07-24T16:44:16.726717Z", + "iopub.status.idle": "2023-07-24T16:44:16.733786Z", + "shell.execute_reply": "2023-07-24T16:44:16.732415Z" + }, + "papermill": { + "duration": 0.033511, + "end_time": "2023-07-24T16:44:16.736496", + "exception": false, + "start_time": "2023-07-24T16:44:16.702985", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def score(col, y, split_value):\n", + " lhs = col <= split_value\n", + " return (_side_score(lhs, y) + _side_score(~lhs, y))/len(y)\n", + "\n", + "def _side_score(side, y):\n", + " count = side.sum()\n", + " if count <=1: return 0\n", + " return y[side].std() * count " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0e8ba4c9", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:16.785021Z", + "iopub.status.busy": "2023-07-24T16:44:16.784601Z", + "iopub.status.idle": "2023-07-24T16:44:16.794431Z", + "shell.execute_reply": "2023-07-24T16:44:16.793242Z" + }, + "papermill": { + "duration": 0.037087, + "end_time": "2023-07-24T16:44:16.796937", + "exception": false, + "start_time": "2023-07-24T16:44:16.759850", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.36017711175753037" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score(train_df.AB, train_df[dependent], 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d3c0d15d", + "metadata": { + "execution": { + 
"iopub.execute_input": "2023-07-24T16:44:16.845942Z", + "iopub.status.busy": "2023-07-24T16:44:16.845463Z", + "iopub.status.idle": "2023-07-24T16:44:16.860957Z", + "shell.execute_reply": "2023-07-24T16:44:16.859816Z" + }, + "papermill": { + "duration": 0.043549, + "end_time": "2023-07-24T16:44:16.863292", + "exception": false, + "start_time": "2023-07-24T16:44:16.819743", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EJ
01
10
21
31
41
\n", + "
# Work on a copy so the original frame keeps its categorical dtypes, then
# replace every categorical column with its integer category codes.
train_df_n = train_df.copy()
encoded_categoricals = train_df_n[categoricals].apply(lambda column: column.cat.codes)
train_df_n[categoricals] = encoded_categoricals
train_df_n[categoricals].head()
def min_column(df, col_name):
    """Find the split threshold of one column that gives the lowest score.

    Every distinct value of ``df[col_name]`` is tried as a threshold and
    scored with ``score`` against the module-level ``dependent`` column.

    Args:
        df: Frame holding both the feature column and the dependent column.
        col_name: Name of the feature column to search.

    Returns:
        ``(best_threshold, best_score)`` for the lowest-scoring threshold;
        on ties the first threshold in ``unique()`` order wins.
    """
    feature = df[col_name]
    target = df[dependent]
    candidates = feature.unique()
    split_scores = np.array(
        [score(feature, target, threshold) for threshold in candidates]
    )
    best = split_scores.argmin()
    return candidates[best], split_scores[best]
# Best (threshold, score) pair for every continuous and categorical column,
# evaluated on the full encoded training frame.
columns = conts + categoricals
splits = dict((name, min_column(train_df_n, name)) for name in columns)
0.37657306471010377),\n", + " 'CR': (0.527325, 0.35228611051878406),\n", + " 'CS': (62.2516675, 0.37277362672289577),\n", + " 'CU': (1.274427, 0.37534114864714596),\n", + " 'CW ': (35.67944, 0.3772898765041537),\n", + " 'DA': (27.36564, 0.353137425790351),\n", + " 'DE': (149.18453, 0.3625701653246753),\n", + " 'DF': (0.500175, 0.36551421649454285),\n", + " 'DH': (0.240504, 0.3671902453269601),\n", + " 'DI': (253.8155925, 0.3550252288774408),\n", + " 'DL': (134.1642, 0.37162996473999704),\n", + " 'DN': (59.068544, 0.3749922782916859),\n", + " 'DU': (2.27601, 0.3207517917154871),\n", + " 'DV': (2.19891, 0.37819979248926816),\n", + " 'DY': (4.474032, 0.3708655621394996),\n", + " 'EB': (6.269316, 0.3625606667600181),\n", + " 'EE': (1.463253, 0.36191183975348784),\n", + " 'EG': (6845.912275, 0.37731357741606586),\n", + " 'EH': (0.389376, 0.3593626675299252),\n", + " 'EL': (46.05744, 0.37829464648148575),\n", + " 'EP': (224.078075, 0.37429306818140773),\n", + " 'EU': (8.497392, 0.3699652228963962),\n", + " 'FC': (13.33752, 0.37580636037706155),\n", + " 'FD ': (8.151501, 0.3607755013721559),\n", + " 'FE': (15667.04141, 0.3658514670660312),\n", + " 'FI': (8.9724075, 0.3657210528731738),\n", + " 'FL': (7.925430474, 0.3436877494985317),\n", + " 'FR': (2.73702, 0.365585424163423),\n", + " 'FS': (0.839852, 0.37680072033409695),\n", + " 'GB': (40.435794, 0.37726637931071944),\n", + " 'GE': (363.134821, 0.37718575251296815),\n", + " 'GF': (14737.27446, 0.37078665888029855),\n", + " 'GH': (61.028121, 0.3762112438443829),\n", + " 'GI': (117.6047, 0.3767203577225661),\n", + " 'GL': (0.121055405, 0.3450719268397208),\n", + " 'EJ': (0, 0.3773339803468088)}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "splits" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "52639ab5", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:44:38.686199Z", + "iopub.status.busy": 
# Pick the column whose best split has the lowest score. Using min() instead
# of a hand-rolled loop seeded with `min_score = 1` avoids leaving
# `min_score_column` undefined when no score happens to be below 1.
min_score_column = min(splits, key=lambda name: splits[name][1])
min_score = splits[min_score_column][1]
min_score_column, splits[min_score_column]

# `random` is numpy.random here (the notebook does `from numpy import random`).
# Seeding it makes the split below reproducible because no explicit
# random_state is passed to train_test_split.
random.seed(42)
model_train, model_val = train_test_split(train_df_n, test_size=0.25)
def x_y(df):
    """Split ``df`` into its feature frame and its target column.

    The feature columns are the module-level ``conts`` followed by
    ``categoricals``; the target is the module-level ``dependent`` column.

    Returns:
        ``(features, target)``.
    """
    feature_names = conts + categoricals
    features = df[feature_names]
    target = df[dependent]
    return features, target
"exception": false, + "start_time": "2023-07-24T16:44:54.686836", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'AB': (0.581128, 0.3500151266645586),\n", + " 'AF': (2807.2121, 0.3482960305933906),\n", + " 'AH': (196.117971, 0.37031622929971825),\n", + " 'AM': (113.124119, 0.3677805771621117),\n", + " 'AR': (15.870966, 0.36607803019531715),\n", + " 'AX': (17.877462, 0.3701715230565572),\n", + " 'AY': (0.038976, 0.372511904676532),\n", + " 'AZ': (23.626198, 0.3735437010139049),\n", + " 'BC': (13.500788, 0.35865783745331065),\n", + " 'BD ': (10563.27538, 0.37017152305655715),\n", + " 'BN': (20.8329, 0.36082123208699335),\n", + " 'BP': (282.245337, 0.36079488319104713),\n", + " 'BQ': (69.65249, 0.36180119785038894),\n", + " 'BR': (5161.327397, 0.36527011004085347),\n", + " 'BZ': (2885.319798, 0.36963503519601426),\n", + " 'CB': (12.931737, 0.37244503925274197),\n", + " 'CC': (0.61375644, 0.3617285103240913),\n", + " 'CD ': (85.68186, 0.3631411826566342),\n", + " 'CF': (1.8504485, 0.36160897217069404),\n", + " 'CH': (0.012338, 0.3725637613110524),\n", + " 'CL': (1.24754, 0.3725459888575431),\n", + " 'CR': (0.527325, 0.34941117879168376),\n", + " 'CS': (62.2516675, 0.3671272819598272),\n", + " 'CU': (1.125468, 0.366462462615929),\n", + " 'CW ': (35.67944, 0.3672847631433858),\n", + " 'DA': (29.1194, 0.3463316955107069),\n", + " 'DE': (155.36692, 0.36119828797693376),\n", + " 'DF': (0.500175, 0.3650918054126017),\n", + " 'DH': (0.2719335, 0.362288870397952),\n", + " 'DI': (192.67764, 0.34941117879168376),\n", + " 'DL': (134.1642, 0.3646293261032984),\n", + " 'DN': (21.024744, 0.36748569748798376),\n", + " 'DU': (2.262216, 0.3196074664465314),\n", + " 'DV': (1.74307, 0.37176172009930203),\n", + " 'DY': (3.890072, 0.3676248337188574),\n", + " 'EB': (6.269316, 0.35697558261424034),\n", + " 'EE': (1.463253, 0.35748838200348587),\n", + " 'EG': (6845.912275, 0.3720728224305451),\n", + " 'EH': (1.521, 0.3527183592451205),\n", 
def get_best_split(splits):
    """Return the column whose best split has the lowest score.

    Args:
        splits: Mapping of column name to a ``(split_value, score)`` pair.

    Returns:
        ``(column, splits[column])`` for the lowest-scoring column. When
        several columns share the lowest score, the first one in iteration
        order is returned.

    Raises:
        ValueError: If ``splits`` is empty or holds no finite score.
    """
    best = None
    # Start from +inf rather than an arbitrary cap such as 1, so that scores
    # of any magnitude can still be selected.
    best_score = float("inf")
    for column, candidate in splits.items():
        # Strict "<" keeps the first column on ties and skips NaN scores.
        if candidate[1] < best_score:
            best_score = candidate[1]
            best = (column, candidate)
    if best is None:
        raise ValueError("splits contains no column with a finite score")
    return best
# One-rule prediction from the best first-level split (column "DU").
# The previous `<=` comparison predicted the positive class for rows at or
# below the threshold and scored a mean absolute error of 0.8645 on the
# validation set, i.e. it was wrong for about 86% of rows. The positive class
# therefore belongs to the rows *above* the threshold.
preds = model_val_x["DU"] > 2.262216

mean_absolute_error(model_val_y, preds)
def balanced_logarithmic_loss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    The mean negative log-likelihood is computed separately for the rows of
    class 0 and class 1, and the two per-class losses are averaged so that
    each class carries equal weight regardless of its frequency.

    Args:
        y_true: 1-D array-like of binary labels (0 or 1).
        y_pred: 1-D array-like of predicted probabilities of class 1.
            Booleans / hard 0-1 predictions are accepted and clipped.

    Returns:
        The balanced log loss. If only one class occurs in ``y_true`` the
        loss of that class alone is returned (instead of NaN from a division
        by zero); if ``y_true`` is empty the result is NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    # Clip away exact 0 and 1 so the logarithms below stay finite.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)

    per_class_losses = []
    for label, log_likelihood in ((0, np.log(1 - y_pred)), (1, np.log(y_pred))):
        members = y_true == label
        # Skip a class with no rows: its mean loss is undefined.
        if members.any():
            per_class_losses.append(-log_likelihood[members].mean())

    if not per_class_losses:
        return float("nan")
    return sum(per_class_losses) / len(per_class_losses)
"text/plain": [ + "({'AB': (0.363205, 0.2205227532918278),\n", + " 'AF': (1744.8442, 0.26422367520400314),\n", + " 'AH': (186.276801, 0.28281828339030296),\n", + " 'AM': (16.333578, 0.2678889313583875),\n", + " 'AR': (15.870966, 0.2866140712941824),\n", + " 'AX': (5.244528, 0.2773853964685262),\n", + " 'AY': (0.6019965, 0.2874443162878375),\n", + " 'AZ': (8.186298, 0.2851965412728709),\n", + " 'BC': (13.500788, 0.254228411937528),\n", + " 'BD ': (10563.27538, 0.28342483094474047),\n", + " 'BN': (15.1833, 0.2815221656802702),\n", + " 'BP': (318.7728, 0.27010283692431253),\n", + " 'BQ': (11.865775, 0.2794184862139197),\n", + " 'BR': (5161.327397, 0.2752096532354187),\n", + " 'BZ': (1703.832302, 0.28396363710150363),\n", + " 'CB': (12.931737, 0.28032816063022753),\n", + " 'CC': (0.61375644, 0.2699890428677908),\n", + " 'CD ': (88.048368, 0.2693262549530457),\n", + " 'CF': (2.548358, 0.27644641567153155),\n", + " 'CH': (0.016716, 0.2836102268644721),\n", + " 'CL': (1.152065, 0.2817474069769845),\n", + " 'CR': (0.6729, 0.25077109773069917),\n", + " 'CS': (55.8007175, 0.2831938474420979),\n", + " 'CU': (1.059264, 0.2763697098465586),\n", + " 'CW ': (44.301128, 0.28402599531661465),\n", + " 'DA': (38.83104, 0.24522761095128734),\n", + " 'DE': (151.051665, 0.2694259159914128),\n", + " 'DF': (0.500175, 0.2709760197318968),\n", + " 'DH': (0.240504, 0.26672971464460576),\n", + " 'DI': (194.021745, 0.24826322692985892),\n", + " 'DL': (113.1762, 0.25651840267368864),\n", + " 'DN': (59.068544, 0.27934746754006307),\n", + " 'DV': (3.04325, 0.28815065732759965),\n", + " 'DY': (8.91662, 0.2751628343696961),\n", + " 'EB': (17.691204, 0.28033350160822096),\n", + " 'EE': (1.443098, 0.2525775803204561),\n", + " 'EG': (2330.295275, 0.28380953536491194),\n", + " 'EH': (0.03042, 0.2776192844889819),\n", + " 'EL': (24.567972, 0.2708402531316478),\n", + " 'EP': (224.078075, 0.2873304659747274),\n", + " 'EU': (6.537624, 0.22930957657304885),\n", + " 'FC': (26.663616, 
0.28636197449296075),\n", + " 'FD ': (5.325489, 0.270404190514022),\n", + " 'FE': (15667.04141, 0.2782594217449274),\n", + " 'FI': (7.822947, 0.2738422457069847),\n", + " 'FL': (0.296625, 0.2858261205083379),\n", + " 'FR': (2.86346, 0.256970642908978),\n", + " 'FS': (1.21914, 0.2865342615078834),\n", + " 'GB': (34.845292, 0.2833625612804258),\n", + " 'GE': (771.058022, 0.29019098509591523),\n", + " 'GF': (1571.342004, 0.2846660155262617),\n", + " 'GH': (43.20081, 0.2795202987385677),\n", + " 'GI': (8.701076, 0.28277685434606015),\n", + " 'GL': (0.561, 0.2841560493396353),\n", + " 'EJ': (0, 0.2877201690556401)},\n", + " {'AB': (0.235015, 0.4583176806743689),\n", + " 'AF': (2668.16616, 0.4708737384176801),\n", + " 'AH': (170.365458, 0.47075213362693974),\n", + " 'AM': (44.401689, 0.4646573463122341),\n", + " 'AR': (14.727774, 0.44553059864417316),\n", + " 'AX': (6.272172, 0.4432056139475385),\n", + " 'AY': (0.1068795, 0.45870530538108095),\n", + " 'AZ': (17.89768, 0.4828616963629532),\n", + " 'BC': (11.733246, 0.4689554885298609),\n", + " 'BD ': (7678.43794, 0.4785433967084947),\n", + " 'BN': (19.0674, 0.4789832263550328),\n", + " 'BP': (752.38011, 0.4789832263550328),\n", + " 'BQ': (40.66405, 0.46574855874267257),\n", + " 'BR': (906.218837, 0.47604720080279034),\n", + " 'BZ': (2165.826792, 0.48286169636295323),\n", + " 'CB': (64.134798, 0.48142065877400064),\n", + " 'CC': (0.28939032, 0.47684593482996557),\n", + " 'CD ': (38.878912, 0.4645771635252842),\n", + " 'CF': (2.600055, 0.4493931256642845),\n", + " 'CH': (0.048158, 0.4789832263550328),\n", + " 'CL': (3.341625, 0.47684593482996557),\n", + " 'CR': (0.42195, 0.4645771635252842),\n", + " 'CS': (26.776673, 0.4519701054647674),\n", + " 'CU': (0.408258, 0.45870530538108095),\n", + " 'CW ': (45.734856, 0.47684593482996557),\n", + " 'DA': (31.61424, 0.4323589687324282),\n", + " 'DE': (38.189785, 0.4789832263550328),\n", + " 'DF': (2.976129, 0.4789832263550328),\n", + " 'DH': (0.524736, 0.4689554885298609),\n", + " 
'DI': (148.8737475, 0.4630168573674233),\n", + " 'DL': (134.1642, 0.4689554885298609),\n", + " 'DN': (27.275528, 0.4646573463122341),\n", + " 'DV': (2.08236, 0.4645771635252842),\n", + " 'DY': (8.184424, 0.47684593482996557),\n", + " 'EB': (4.926396, 0.4482172507605322),\n", + " 'EE': (2.450848, 0.4806126011483664),\n", + " 'EG': (979.1558875, 0.42561760588538),\n", + " 'EH': (1.521, 0.4323589687324282),\n", + " 'EL': (52.693641, 0.4784120740720739),\n", + " 'EP': (98.5989875, 0.45771848227839446),\n", + " 'EU': (77.304492, 0.45870530538108095),\n", + " 'FC': (34.489056, 0.4519040862043542),\n", + " 'FD ': (31.863879, 0.4519701054647674),\n", + " 'FE': (20522.19526, 0.4389950342373603),\n", + " 'FI': (8.616819, 0.42587247147903806),\n", + " 'FL': (23.9606794, 0.42561760588538),\n", + " 'FR': (2.51111, 0.47075213362693974),\n", + " 'FS': (1.571336, 0.4689554885298609),\n", + " 'GB': (9.888026, 0.47075213362693974),\n", + " 'GE': (218.570828, 0.4645771635252842),\n", + " 'GF': (840.636873, 0.47684593482996557),\n", + " 'GH': (17.767775, 0.44553059864417316),\n", + " 'GI': (90.493248, 0.42561760588538),\n", + " 'GL': (0.047863636, 0.40470758822889663),\n", + " 'EJ': (1, 0.4888023515980042)})" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "second_level_columns = [c for c in (conts + categoricals) if c != \"DU\"]\n", + "left_splits = {col: min_column(left_group, col) for col in second_level_columns}\n", + "right_splits = {col: min_column(right_group, col) for col in second_level_columns}\n", + "left_splits, right_splits" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "26273a6e", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:10.811960Z", + "iopub.status.busy": "2023-07-24T16:45:10.811092Z", + "iopub.status.idle": "2023-07-24T16:45:10.819217Z", + "shell.execute_reply": "2023-07-24T16:45:10.818021Z" + }, + "papermill": { + "duration": 0.037608, + "end_time": 
"2023-07-24T16:45:10.821589", + "exception": false, + "start_time": "2023-07-24T16:45:10.783981", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(('AB', (0.363205, 0.2205227532918278)),\n", + " ('GL', (0.047863636, 0.40470758822889663)))" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_left_split = get_best_split(left_splits)\n", + "best_right_split = get_best_split(right_splits)\n", + "best_left_split, best_right_split" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e2c06b1a", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:10.874736Z", + "iopub.status.busy": "2023-07-24T16:45:10.873888Z", + "iopub.status.idle": "2023-07-24T16:45:10.893869Z", + "shell.execute_reply": "2023-07-24T16:45:10.892807Z" + }, + "papermill": { + "duration": 0.049636, + "end_time": "2023-07-24T16:45:10.896584", + "exception": false, + "start_time": "2023-07-24T16:45:10.846948", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "model = DecisionTreeClassifier(max_leaf_nodes=4).fit(model_train_x, model_train_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "05da0a60", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:10.950149Z", + "iopub.status.busy": "2023-07-24T16:45:10.949300Z", + "iopub.status.idle": "2023-07-24T16:45:10.956395Z", + "shell.execute_reply": "2023-07-24T16:45:10.955404Z" + }, + "papermill": { + "duration": 0.036988, + "end_time": "2023-07-24T16:45:10.958987", + "exception": false, + "start_time": "2023-07-24T16:45:10.921999", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def draw_tree(tree, df, size=10, ratio=0.6, precision=2, **kwargs):\n", + " dot_format = export_graphviz(tree, out_file=None, feature_names=df.columns, filled=True, rounded=True,\n", + " special_characters=True, rotate=False, 
precision=precision, **kwargs)\n", + " return graphviz.Source(re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', dot_format))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "150a5f26", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.016195Z", + "iopub.status.busy": "2023-07-24T16:45:11.015706Z", + "iopub.status.idle": "2023-07-24T16:45:11.021318Z", + "shell.execute_reply": "2023-07-24T16:45:11.020015Z" + }, + "papermill": { + "duration": 0.038888, + "end_time": "2023-07-24T16:45:11.024293", + "exception": false, + "start_time": "2023-07-24T16:45:10.985405", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#draw_tree(model, model_train_x, size=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "6ec6bbe6", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.077510Z", + "iopub.status.busy": "2023-07-24T16:45:11.077074Z", + "iopub.status.idle": "2023-07-24T16:45:11.083150Z", + "shell.execute_reply": "2023-07-24T16:45:11.081749Z" + }, + "papermill": { + "duration": 0.036188, + "end_time": "2023-07-24T16:45:11.086010", + "exception": false, + "start_time": "2023-07-24T16:45:11.049822", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def gini(df, condition):\n", + " actual = df.loc[condition, dependent]\n", + " return 1 - actual.mean()**2 - (1-actual).mean()**2" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "0a5a233d", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.140927Z", + "iopub.status.busy": "2023-07-24T16:45:11.140516Z", + "iopub.status.idle": "2023-07-24T16:45:11.151094Z", + "shell.execute_reply": "2023-07-24T16:45:11.149586Z" + }, + "papermill": { + "duration": 0.041471, + "end_time": "2023-07-24T16:45:11.153859", + "exception": false, + "start_time": "2023-07-24T16:45:11.112388", + "status": "completed" + }, + "tags": [] + }, + 
"outputs": [ + { + "data": { + "text/plain": [ + "(0.16940873380267307, 0.47061524334251614)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gini(model_train, model_train['DU'] <= 2.28), gini(model_train, model_train['DU'] > 2.28)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "776f15c6", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.207742Z", + "iopub.status.busy": "2023-07-24T16:45:11.207279Z", + "iopub.status.idle": "2023-07-24T16:45:11.219968Z", + "shell.execute_reply": "2023-07-24T16:45:11.218553Z" + }, + "papermill": { + "duration": 0.042482, + "end_time": "2023-07-24T16:45:11.222553", + "exception": false, + "start_time": "2023-07-24T16:45:11.180071", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.11612903225806452" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds = model.predict(model_val_x)\n", + "mean_absolute_error(model_val_y, preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "9a94dbfa", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.276408Z", + "iopub.status.busy": "2023-07-24T16:45:11.275966Z", + "iopub.status.idle": "2023-07-24T16:45:11.283919Z", + "shell.execute_reply": "2023-07-24T16:45:11.282776Z" + }, + "papermill": { + "duration": 0.037821, + "end_time": "2023-07-24T16:45:11.286249", + "exception": false, + "start_time": "2023-07-24T16:45:11.248428", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5.549265256402579" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "balanced_logarithmic_loss(model_val_y.to_numpy(), preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "8fe100ca", + "metadata": { + "execution": { + 
"iopub.execute_input": "2023-07-24T16:45:11.340440Z", + "iopub.status.busy": "2023-07-24T16:45:11.339472Z", + "iopub.status.idle": "2023-07-24T16:45:11.365194Z", + "shell.execute_reply": "2023-07-24T16:45:11.364004Z" + }, + "papermill": { + "duration": 0.055722, + "end_time": "2023-07-24T16:45:11.367845", + "exception": false, + "start_time": "2023-07-24T16:45:11.312123", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
DecisionTreeClassifier(min_samples_leaf=50)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "DecisionTreeClassifier(min_samples_leaf=50)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = DecisionTreeClassifier(min_samples_leaf=50)\n", + "model.fit(model_train_x, model_train_y)\n", + "#draw_tree(model, model_train_x, size=12)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "a09cfd81", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.424536Z", + "iopub.status.busy": "2023-07-24T16:45:11.423634Z", + "iopub.status.idle": "2023-07-24T16:45:11.435158Z", + "shell.execute_reply": "2023-07-24T16:45:11.434167Z" + }, + "papermill": { + "duration": 0.041775, + "end_time": "2023-07-24T16:45:11.437438", + "exception": false, + "start_time": "2023-07-24T16:45:11.395663", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.12903225806451613, 8.450509680016193)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds = model.predict(model_val_x)\n", + "\n", + "mean_absolute_error(model_val_y, preds), balanced_logarithmic_loss(model_val_y.to_numpy(), preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "c7718f8a", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.493323Z", + "iopub.status.busy": "2023-07-24T16:45:11.492895Z", + "iopub.status.idle": "2023-07-24T16:45:11.500648Z", + "shell.execute_reply": "2023-07-24T16:45:11.499687Z" + }, + "papermill": { + "duration": 0.038846, + "end_time": "2023-07-24T16:45:11.503195", + "exception": false, + "start_time": "2023-07-24T16:45:11.464349", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def get_tree(proportion=0.75):\n", + " n = len(model_train_y)\n", + " indexes = random.choice(n, int(n*proportion))\n", + " return 
DecisionTreeClassifier(min_samples_leaf=5).fit(model_train_x.iloc[indexes], model_train_y.iloc[indexes]) " + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "960eb86f", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:11.560281Z", + "iopub.status.busy": "2023-07-24T16:45:11.559868Z", + "iopub.status.idle": "2023-07-24T16:45:12.735484Z", + "shell.execute_reply": "2023-07-24T16:45:12.734570Z" + }, + "papermill": { + "duration": 1.207827, + "end_time": "2023-07-24T16:45:12.737879", + "exception": false, + "start_time": "2023-07-24T16:45:11.530052", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[DecisionTreeClassifier(min_samples_leaf=5),\n", + " DecisionTreeClassifier(min_samples_leaf=5),\n", + " DecisionTreeClassifier(min_samples_leaf=5),\n", + " DecisionTreeClassifier(min_samples_leaf=5),\n", + " DecisionTreeClassifier(min_samples_leaf=5)]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trees = [get_tree() for _ in range(100)]\n", + "trees[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "074c183c", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:12.794020Z", + "iopub.status.busy": "2023-07-24T16:45:12.793553Z", + "iopub.status.idle": "2023-07-24T16:45:12.998165Z", + "shell.execute_reply": "2023-07-24T16:45:12.996879Z" + }, + "papermill": { + "duration": 0.236135, + "end_time": "2023-07-24T16:45:13.001084", + "exception": false, + "start_time": "2023-07-24T16:45:12.764949", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "all_preds = [t.predict(model_val_x) for t in trees]\n", + "avg_preds = np.stack(all_preds).mean(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "0151ead3", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:13.057911Z", + "iopub.status.busy": 
"2023-07-24T16:45:13.057470Z", + "iopub.status.idle": "2023-07-24T16:45:13.066854Z", + "shell.execute_reply": "2023-07-24T16:45:13.065709Z" + }, + "papermill": { + "duration": 0.040619, + "end_time": "2023-07-24T16:45:13.069001", + "exception": false, + "start_time": "2023-07-24T16:45:13.028382", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.1665806451612903, 0.4201347888809029)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_absolute_error(model_val_y, avg_preds), balanced_logarithmic_loss(model_val_y.to_numpy(), avg_preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "7485c5b0", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:13.125923Z", + "iopub.status.busy": "2023-07-24T16:45:13.124630Z", + "iopub.status.idle": "2023-07-24T16:45:13.509858Z", + "shell.execute_reply": "2023-07-24T16:45:13.508406Z" + }, + "papermill": { + "duration": 0.417005, + "end_time": "2023-07-24T16:45:13.513124", + "exception": false, + "start_time": "2023-07-24T16:45:13.096119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.05806451612903226, 5.180816459236603)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=5)\n", + "rf.fit(model_train_x, model_train_y)\n", + "\n", + "preds = rf.predict(model_val_x)\n", + "\n", + "\n", + "mean_absolute_error(model_val_y, preds), balanced_logarithmic_loss(model_val_y.to_numpy(), preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "ab271362", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:13.569934Z", + "iopub.status.busy": "2023-07-24T16:45:13.569468Z", + "iopub.status.idle": "2023-07-24T16:45:13.583324Z", + "shell.execute_reply": 
"2023-07-24T16:45:13.582374Z" + }, + "papermill": { + "duration": 0.044952, + "end_time": "2023-07-24T16:45:13.585642", + "exception": false, + "start_time": "2023-07-24T16:45:13.540690", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EJ
00
10
20
30
40
\n", + "
" + ], + "text/plain": [ + " EJ\n", + "0 0\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 0" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_df_n = test_df.copy()\n", + "test_df_n[categoricals] = test_df_n[categoricals].apply(lambda x: x.cat.codes)\n", + "test_df_n[categoricals].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "6b55ebf4", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:13.646524Z", + "iopub.status.busy": "2023-07-24T16:45:13.645977Z", + "iopub.status.idle": "2023-07-24T16:45:13.676183Z", + "shell.execute_reply": "2023-07-24T16:45:13.674914Z" + }, + "papermill": { + "duration": 0.063396, + "end_time": "2023-07-24T16:45:13.678929", + "exception": false, + "start_time": "2023-07-24T16:45:13.615533", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.58356527, 0.41643473],\n", + " [0.58356527, 0.41643473],\n", + " [0.58356527, 0.41643473],\n", + " [0.58356527, 0.41643473],\n", + " [0.58356527, 0.41643473]])" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns = conts + categoricals\n", + "test_probs = rf.predict_proba(test_df_n[columns])\n", + "test_probs" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "2175d336", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:13.737857Z", + "iopub.status.busy": "2023-07-24T16:45:13.736621Z", + "iopub.status.idle": "2023-07-24T16:45:13.742994Z", + "shell.execute_reply": "2023-07-24T16:45:13.741783Z" + }, + "papermill": { + "duration": 0.038114, + "end_time": "2023-07-24T16:45:13.745698", + "exception": false, + "start_time": "2023-07-24T16:45:13.707584", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "sample_submission_df[[\"class_0\", \"class_1\"]] = test_probs" + ] + }, + { + "cell_type": 
"code", + "execution_count": 57, + "id": "93798a36", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:13.802864Z", + "iopub.status.busy": "2023-07-24T16:45:13.802409Z", + "iopub.status.idle": "2023-07-24T16:45:13.811331Z", + "shell.execute_reply": "2023-07-24T16:45:13.810078Z" + }, + "papermill": { + "duration": 0.040514, + "end_time": "2023-07-24T16:45:13.814026", + "exception": false, + "start_time": "2023-07-24T16:45:13.773512", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "sample_submission_df.to_csv(\"submission.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "62831b36", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:13.871255Z", + "iopub.status.busy": "2023-07-24T16:45:13.870405Z", + "iopub.status.idle": "2023-07-24T16:45:14.906497Z", + "shell.execute_reply": "2023-07-24T16:45:14.905184Z" + }, + "papermill": { + "duration": 1.067668, + "end_time": "2023-07-24T16:45:14.909118", + "exception": false, + "start_time": "2023-07-24T16:45:13.841450", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pd.DataFrame(dict(cols=model_train_x.columns, imp=rf.feature_importances_)).plot('cols', 'imp', 'barh', figsize=(8, 20));" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "6086949d", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-24T16:45:14.967939Z", + "iopub.status.busy": "2023-07-24T16:45:14.967498Z", + "iopub.status.idle": "2023-07-24T16:45:14.980540Z", + "shell.execute_reply": "2023-07-24T16:45:14.979376Z" + }, + "papermill": { + "duration": 0.04537, + "end_time": "2023-07-24T16:45:14.982989", + "exception": false, + "start_time": "2023-07-24T16:45:14.937619", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Idclass_0class_1
000eed32682bb0.5835650.416435
1010ebe33f6680.5835650.416435
202fa521e18380.5835650.416435
3040e15f562a20.5835650.416435
4046e85c7cc7f0.5835650.416435
\n", + "
" + ], + "text/plain": [ + " Id class_0 class_1\n", + "0 00eed32682bb 0.583565 0.416435\n", + "1 010ebe33f668 0.583565 0.416435\n", + "2 02fa521e1838 0.583565 0.416435\n", + "3 040e15f562a2 0.583565 0.416435\n", + "4 046e85c7cc7f 0.583565 0.416435" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_submission_df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "papermill": { + "default_parameters": {}, + "duration": 77.807742, + "end_time": "2023-07-24T16:45:16.137577", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2023-07-24T16:43:58.329835", + "version": "2.4.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}