{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "8ced3efa-401b-4b60-bc1b-4e861aecc71e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "D:\\LUTFI\\Anacnda\\Lib\\site-packages\\dask\\dataframe\\_pyarrow_compat.py:23: UserWarning: You are using pyarrow version 11.0.0 which is known to be insecure. See https://www.cve.org/CVERecord?id=CVE-2023-47248 for further details. Please upgrade to pyarrow>=14.0.1 or install pyarrow-hotfix to patch your current version.\n", " warnings.warn(\n" ] } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler\n", "\n", "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold\n", "\n", "from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor\n", "from lightgbm import LGBMRegressor\n", "\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score" ] }, { "cell_type": "code", "execution_count": 2, "id": "ffc3b3cd-b9dd-47f8-8827-58aec39a6309", "metadata": {}, "outputs": [], "source": [ "df = pd.read_excel('Real_estate_valuation _data_set.xlsx')" ] }, { "cell_type": "code", "execution_count": 3, "id": "d5857eae-ef7b-4b17-a12b-d2b9e4bdc22e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NoX1 transaction dateX2 house ageX3 distance to the nearest MRT stationX4 number of convenience storesX5 latitudeX6 longitudeY house price of unit area
012012.91666732.084.878821024.98298121.5402437.9
122012.91666719.5306.59470924.98034121.5395142.2
232013.58333313.3561.98450524.98746121.5439147.3
342013.50000013.3561.98450524.98746121.5439154.8
452012.8333335.0390.56840524.97937121.5424543.1
\n", "
" ], "text/plain": [ " No X1 transaction date X2 house age \\\n", "0 1 2012.916667 32.0 \n", "1 2 2012.916667 19.5 \n", "2 3 2013.583333 13.3 \n", "3 4 2013.500000 13.3 \n", "4 5 2012.833333 5.0 \n", "\n", " X3 distance to the nearest MRT station X4 number of convenience stores \\\n", "0 84.87882 10 \n", "1 306.59470 9 \n", "2 561.98450 5 \n", "3 561.98450 5 \n", "4 390.56840 5 \n", "\n", " X5 latitude X6 longitude Y house price of unit area \n", "0 24.98298 121.54024 37.9 \n", "1 24.98034 121.53951 42.2 \n", "2 24.98746 121.54391 47.3 \n", "3 24.98746 121.54391 54.8 \n", "4 24.97937 121.54245 43.1 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "7de11b8e-963d-432f-9246-5595fcdc6461", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NoDateAgeDistance_MRTTotal_SotresLatitudelongitudeprice
012012.91666732.084.878821024.98298121.5402437.9
122012.91666719.5306.59470924.98034121.5395142.2
232013.58333313.3561.98450524.98746121.5439147.3
342013.50000013.3561.98450524.98746121.5439154.8
452012.8333335.0390.56840524.97937121.5424543.1
\n", "
" ], "text/plain": [ " No Date Age Distance_MRT Total_Sotres Latitude longitude \\\n", "0 1 2012.916667 32.0 84.87882 10 24.98298 121.54024 \n", "1 2 2012.916667 19.5 306.59470 9 24.98034 121.53951 \n", "2 3 2013.583333 13.3 561.98450 5 24.98746 121.54391 \n", "3 4 2013.500000 13.3 561.98450 5 24.98746 121.54391 \n", "4 5 2012.833333 5.0 390.56840 5 24.97937 121.54245 \n", "\n", " price \n", "0 37.9 \n", "1 42.2 \n", "2 47.3 \n", "3 54.8 \n", "4 43.1 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns = ['No', 'Date', 'Age', 'Distance_MRT', 'Total_Sotres', 'Latitude', 'longitude', 'price']\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "fe195964-c5b7-420e-8601-21603a6f092a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 414 entries, 0 to 413\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 No 414 non-null int64 \n", " 1 Date 414 non-null float64\n", " 2 Age 414 non-null float64\n", " 3 Distance_MRT 414 non-null float64\n", " 4 Total_Sotres 414 non-null int64 \n", " 5 Latitude 414 non-null float64\n", " 6 longitude 414 non-null float64\n", " 7 price 414 non-null float64\n", "dtypes: float64(6), int64(2)\n", "memory usage: 26.0 KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "753d0b20-6bdb-4ccc-94e9-1f1374d9502b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((331, 5), (83, 5))" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " df.drop(['No', 'Date', 'price'], axis=1),\n", " df['price'],\n", " test_size= 0.2,\n", " random_state= 1)\n", "\n", "for i in [X_train, X_test, y_train, y_test]:\n", " i.reset_index(drop=True, inplace=True)\n", "\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "code", 
"execution_count": 7, "id": "6cf510bb-b381-4db0-ae7a-604c0677039f", "metadata": {}, "outputs": [], "source": [ "target = 'price'" ] }, { "cell_type": "code", "execution_count": 8, "id": "c00c5283-e461-484a-85ad-ecf92b1e8cbb", "metadata": {}, "outputs": [], "source": [ "preprocessor_numerik = Pipeline([\n", " ('imputasi', SimpleImputer(strategy='mean')),\n", " ('scaling', StandardScaler())\n", "])" ] }, { "cell_type": "code", "execution_count": 9, "id": "45d5fa0f-87b1-49be-a974-af1923bdda8c", "metadata": {}, "outputs": [], "source": [ "lr = LinearRegression()" ] }, { "cell_type": "code", "execution_count": 10, "id": "d8e0088d-f50f-4e9e-9ef8-44311579c78d", "metadata": {}, "outputs": [], "source": [ "# Baseline model: impute + standardise the numeric features, then fit ordinary least squares.\n", "# NOTE: despite the `rfr` suffix this pipeline holds a LinearRegression, not a random forest;\n", "# the name is kept because later cells refer to `model_rfr`.\n", "model_rfr = Pipeline(steps=[\n", "    ('preprocessor_numerik', preprocessor_numerik),\n", "    ('lr', lr),  # reuse the estimator created in the previous cell instead of building a second one\n", "])" ] }, { "cell_type": "code", "execution_count": 11, "id": "440d9f9f-69ba-48d5-a362-e29951a6a771", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('preprocessor_numerik',\n",
       "                 Pipeline(steps=[('imputasi', SimpleImputer()),\n",
       "                                 ('scaling', StandardScaler())])),\n",
       "                ('lr', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('preprocessor_numerik',\n", " Pipeline(steps=[('imputasi', SimpleImputer()),\n", " ('scaling', StandardScaler())])),\n", " ('lr', LinearRegression())])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_rfr.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 12, "id": "fdc8e212-c98c-48c6-9130-7cfe1c4c85b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([42.36886682, 45.64027105, 52.29266724, 47.85322975, 36.37229111,\n", " 48.6702964 , 14.93141372, 33.87830336, 50.24625987, 40.83238223,\n", " 30.49859064, 48.37004246, 41.02486147, 30.02306706, 42.31075147,\n", " 48.68694663, 43.018887 , 33.55347608, 46.22563127, 30.17706976,\n", " 33.50593387, 39.85433842, 42.16108976, 48.58127858, 34.80094567,\n", " 38.57058392, 47.18117225, 43.19310668, 52.31761086, 48.39034417,\n", " 41.99035299, 30.55918299, 46.41177644, 46.74944722, 42.26086423,\n", " 41.74468415, 15.12079501, 43.30531335, 15.25954788, 38.78936601,\n", " 41.59188631, 41.71778768, 43.15499811, 45.4240889 , 46.17574403,\n", " 47.12788362, 40.15448785, 38.5988199 , 48.63116581, 44.90988295,\n", " 46.61340685, 47.80869225, 41.33640095, 42.23592061, 42.31075147,\n", " 43.18050194, 52.24278 , 32.59440108, 12.11104337, 41.57404166,\n", " 25.52390374, 43.12497981, 48.58127858, 38.63804119, 34.88185988,\n", " 48.4199297 , 50.24625987, 44.30833569, 50.59052451, 43.018887 ,\n", " 38.95202107, 43.48870434, 24.17314066, 32.96000097, 34.07440683,\n", " 37.54660931, 50.24625987, 42.77620817, 49.15749083, 50.24625987,\n", " 15.10601904, 45.43093643, 34.85691627, 52.21783639, 46.76306855,\n", " 40.66134485, 41.75515061, 45.51555296, 44.63257306, 29.564124 ,\n", " 30.67319596, 28.96317173, 47.3550611 , 46.97822192, 35.91936611,\n", " 48.50402977, 47.00656693, 25.52140291, 55.02824079, 47.56351028,\n", " 49.60416586, 38.25881687, 44.27422511, 44.60861129, 21.74705316,\n", " 49.0289474 , 41.02870932, 
43.50749428, 49.59477177, 44.2440042 ,\n", " 45.61532743, 37.03020189, 42.23592061, 32.03787491, 31.37854073,\n", " 14.90647011, 50.09306532, 40.78097996, 46.3067255 , 44.81932843,\n", " 45.40123786, 30.49859064, 46.79592195, 33.84897425, 35.7180079 ,\n", " 47.80869225, 29.01034861, 28.72708906, 39.51915647, 14.95813635,\n", " 44.79438481, 8.06008617, 47.80869225, 52.26772362, 41.39333093,\n", " 52.44232894, 45.69015828, 41.44786325, 43.36428147, 41.29029531,\n", " 25.54030676, 42.63096076, 33.71831813, 37.12219623, 11.72440659,\n", " 27.63427469, 39.05816358, 45.41797523, 46.27599136, 38.62376352,\n", " 30.48435213, 31.92913805, 37.28357107, 45.42618148, 42.21920511,\n", " 38.36378677, 45.69440684, 40.41059826, 47.83363586, 32.32744468,\n", " 45.69015828, 9.59230461, 33.9596696 , 31.2187193 , 46.17574403,\n", " 43.59583988, 40.69807579, 50.41225954, 37.850607 , 48.4042553 ,\n", " 36.14274813, 42.60954866, 45.84398876, 45.17474763, 40.19023373,\n", " 48.27026799, 32.0129313 , 45.29937081, 43.85615123, 36.53482197,\n", " 23.22432109, 38.1440882 , 39.81826002, 2.60500519, 37.28397053,\n", " 30.1670043 , 42.22381852, 50.09306532, 14.8916758 , 43.22629538,\n", " 42.71298657, 36.48338363, 46.22563127, 32.30099697, 51.88100313,\n", " 38.69818738, 46.62193807, 45.85682942, 32.83323107, 43.018887 ,\n", " 12.25831833, 26.41259239, 46.25057488, 14.8991769 , 14.23656569,\n", " 40.98340745, 32.51221627, 34.60492182, 45.85682942, 44.78386047,\n", " 45.05003285, 25.21885406, 33.12828125, 37.04911653, 47.28142775,\n", " 46.22563127, 13.71602892, 35.52616731, 30.54847787, 49.75480868,\n", " 34.91127456, 33.17903829, 52.26772362, 47.90311699, 34.19818489,\n", " 47.15622863, 32.42426584, 54.17874571, 32.24139297, 41.00835106,\n", " 34.233986 , 27.65952669, 30.47364702, 32.36565073, 30.65895745,\n", " 34.9068035 , 26.51236686, 43.45004022, 31.88896929, 46.92833468,\n", " 33.68055385, 46.30046212, 29.38972347, 33.15409468, 43.65715598,\n", " 14.46135214, 53.03558127, 33.34343735, 
30.10223891, 42.90491934,\n", " 54.25357656, 46.05593105, 41.71660439, 14.90647011, 33.05432021,\n", " 42.16108976, 8.88637881, 34.09935045, 46.25122495, 46.35661273,\n", " 50.24625987, 26.51953997, 30.7337883 , 15.25954788, 39.60771879,\n", " 36.54236185, 39.25656155, 42.31897958, 14.58295082, 38.52786553,\n", " 43.02107389, 32.37496591, 14.58607022, 33.32974487, 38.89565721,\n", " 32.34070711, 36.64551078, 36.23607094, 34.88185988, 39.60806735,\n", " 46.17574403, 48.7558839 , 36.61556644, 43.04768545, 47.57748049,\n", " 46.92833468, 53.81962905, 12.57057948, 41.67597908, 12.03621252,\n", " 42.16108976, 39.27535277, 43.018887 , 32.76304058, 40.4377969 ,\n", " 43.47923892, 45.62214993, 49.52155362, 45.5654402 , 29.87500021,\n", " 40.20166259, 35.66740059, 45.98900716, 39.40297485, 32.42485315,\n", " 28.59929244, 45.52237546, 45.97392447, 42.60214485, 40.97535553,\n", " 25.11907959, 52.39244171, 51.29462089, 49.90447038, 38.28376049,\n", " 44.45894959, 53.0854685 , 43.65312151, 38.73947878, 36.24252259,\n", " 43.02368574, 30.57342149, 40.27550731, 49.89341066, 46.37529297,\n", " 43.64572711, 8.1099734 , 43.57063735, 30.97876525, 45.7219244 ,\n", " 30.54847787])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_rfr.predict(X_train)" ] }, { "cell_type": "code", "execution_count": 13, "id": "81805032-a2b8-41ac-b435-83ec09dbdf1b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "38.71299093655648\n", "[-2.85545772 -5.80314009 3.25790289 2.9053199 -0.525551 ]\n" ] } ], "source": [ "print(model_rfr['lr'].intercept_)\n", "print(model_rfr['lr'].coef_)" ] }, { "cell_type": "code", "execution_count": 14, "id": "0be85203-cc12-42a1-b9a5-2075bfb9212b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Age' 'Distance_MRT' 'Total_Sotres' 'Latitude' 'longitude']\n" ] } ], "source": [ "print(model_rfr['preprocessor_numerik'].get_feature_names_out())" ] }, { 
"cell_type": "code", "execution_count": 15, "id": "425b2679-ea37-49a0-ae29-476b7258a9a7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "price = 38.71299093655648 + -2.8554577169465 * Age +\n", "-5.803140091732102 * Distance_MRT +\n", "3.2579028917683615 * Total_Sotres +\n", "2.905319902108692 * Latitude +\n", "-0.5255509950918564 * longitude\n" ] } ], "source": [ "# Print the fitted regression equation, one term per line\n", "coefs = model_rfr['lr'].coef_\n", "feature_names = model_rfr['preprocessor_numerik'].get_feature_names_out()\n", "last = len(coefs) - 1\n", "for i, (coef, name) in enumerate(zip(coefs, feature_names)):\n", "    # The first line carries the target name and intercept; every line except the last ends with '+'\n", "    head = (str(target), '=', model_rfr['lr'].intercept_, '+') if i == 0 else ()\n", "    tail = () if i == last else ('+',)\n", "    print(*head, coef, '*', name, *tail)" ] }, { "cell_type": "code", "execution_count": 16, "id": "7a575181-f99a-4405-b6ed-399c824366fc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "price = 38.71299093655648 + -2.8554577169465 * Age + -5.803140091732102 * Distance_MRT + 3.2579028917683615 * Total_Sotres + 2.905319902108692 * Latitude + -0.5255509950918564 * longitude\n" ] } ], "source": [ "# Print the same equation on a single line, for however many features the model has\n", "coefs = model_rfr['lr'].coef_\n", "feature_names = model_rfr['preprocessor_numerik'].get_feature_names_out()\n", "terms = []\n", "for coef, name in zip(coefs, feature_names):\n", "    terms += ['+', coef, '*', name]\n", "print('price = ', model_rfr['lr'].intercept_, *terms)" ] }, { "cell_type": "markdown", "id":
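"54f1c07c-6060-4765-b752-0c0a87167e2d", "metadata": {}, "source": [ "Dengan melihat rumus di atas, para pebisnis real estate bisa menggunakan rumus tersebut dengan cara mengisi nilai setiap fiturnya. Perlu diingat bahwa koefisien di atas berlaku untuk fitur yang sudah distandardisasi (StandardScaler), sehingga nilai fitur harus diskalakan terlebih dahulu sebelum dimasukkan ke rumus." ] }, { "cell_type": "markdown", "id": "514dc104-9d23-470b-b6ac-e37351f3e844", "metadata": {}, "source": [ "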
" ] }, { "cell_type": "markdown", "id": "07ae22fc-3960-4864-b200-6864b7f4e663", "metadata": {}, "source": [ "### Evaluasi Model" ] }, { "cell_type": "code", "execution_count": 17, "id": "a6543e2f-4130-4319-9de9-e8b44ed37439", "metadata": {}, "outputs": [], "source": [ "#memprediksi X_train dan membandingkannya dengan y_train\n", "predik_train = model_rfr.predict(X_train)" ] }, { "cell_type": "code", "execution_count": 18, "id": "c735c2af-80c8-4536-86d2-cfc9c3e5fe2d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
price
042.368867
145.640271
252.292667
347.853230
436.372291
\n", "
" ], "text/plain": [ " price\n", "0 42.368867\n", "1 45.640271\n", "2 52.292667\n", "3 47.853230\n", "4 36.372291" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#melihat hasil prediksi training \n", "predik_train = pd.DataFrame(predik_train, columns=[target])\n", "predik_train.head()" ] }, { "cell_type": "code", "execution_count": 19, "id": "d3a7528d-046a-47e5-9f6d-c6039933a1c2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Prediksi X_trainAsliEror
042.36886737.44.968867
145.64027151.86.159729
252.29266758.15.807333
347.85323049.51.646770
436.37229130.65.772291
\n", "
" ], "text/plain": [ " Prediksi X_train Asli Eror\n", "0 42.368867 37.4 4.968867\n", "1 45.640271 51.8 6.159729\n", "2 52.292667 58.1 5.807333\n", "3 47.853230 49.5 1.646770\n", "4 36.372291 30.6 5.772291" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Membandingkan 2 hasil prediksi dan asli\n", "compareXy_train = pd.DataFrame(np.column_stack((predik_train, pd.DataFrame(y_train), abs(predik_train - pd.DataFrame(y_train)))), columns=['Prediksi X_train', 'Asli', 'Eror'])\n", "compareXy_train.head()\n", " " ] }, { "cell_type": "markdown", "id": "b783d750-588c-4a2a-9584-b4cc4dc26dda", "metadata": {}, "source": [ "kolom bertuliskan eror merupakan selisih antara prediksi ke nilai asli " ] }, { "cell_type": "code", "execution_count": 20, "id": "ce43d8c8-b069-4936-9217-9e40af307d29", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Prediksi X_trainAsliEror
count331.000000331.000000331.000000
mean38.71299138.7129916.468697
std10.12529613.8144146.807759
min2.6050057.6000000.025631
25%33.16656628.4500002.395571
50%41.44786339.3000005.058957
75%45.98146647.3000008.315330
max55.028241117.50000075.509647
\n", "
" ], "text/plain": [ " Prediksi X_train Asli Eror\n", "count 331.000000 331.000000 331.000000\n", "mean 38.712991 38.712991 6.468697\n", "std 10.125296 13.814414 6.807759\n", "min 2.605005 7.600000 0.025631\n", "25% 33.166566 28.450000 2.395571\n", "50% 41.447863 39.300000 5.058957\n", "75% 45.981466 47.300000 8.315330\n", "max 55.028241 117.500000 75.509647" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compareXy_train.describe()" ] }, { "cell_type": "markdown", "id": "22ba490d-183e-47ee-a4cf-04cefc49e0d7", "metadata": {}, "source": [ "Nilai MAE (rata-rata kolom Eror) di training set adalah 6.47" ] }, { "cell_type": "code", "execution_count": 21, "id": "f319564f-31a1-42de-92bd-88d13fc0cbca", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.scatter(predik_train, y_train, color = 'red')\n", "plt.plot(predik_train, model_rfr.predict(X_train), color = 'blue')\n", "plt.title('Prediksi vs Aktual (Training Set)')\n", "plt.xlabel('Prediksi')\n", "plt.ylabel('Aktual')\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "fea197cb-d29a-4782-8505-d79ac9de0a17", "metadata": {}, "source": [ "Perbandingan antara prediksi X_train dan y_train tampaknya sangat bagus, namun kita tetap perlu menghitung nilai error sekaligus koefisien determinasi (R2)." ] }, { "cell_type": "markdown", "id": "4be13d82-a298-408f-b92a-fcd95249e3b2", "metadata": {}, "source": [ "
" ] }, { "cell_type": "markdown", "id": "01c73dda-8087-456d-834d-75cabb92feab", "metadata": {}, "source": [ "### Menghitung *Metrics* di *Training Set*" ] }, { "cell_type": "code", "execution_count": 22, "id": "63963a93-02fe-4618-b7a4-80ffd0193e64", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MAE = 6.47\n", "MSE = 88.05\n", "RMSE = 9.38\n", "R2 = 0.54\n" ] } ], "source": [ "# Menghitung nilai error (MAE, MSE, RMSE) dan R2 di Training Set\n", "from sklearn import metrics\n", "print('MAE = {0:.2f}'.format(metrics.mean_absolute_error(y_train, predik_train))) # MAE adalah nilai error rata-rata seperti yang ada di tabel compareXy_train\n", "print('MSE = {0:.2f}'.format(metrics.mean_squared_error(y_train, predik_train))) # penulisan {0:.2f} adalah untuk menuliskan 2 angka di belakang koma\n", "print('RMSE = {0:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_train, predik_train))))\n", "print('R2 = {0:.2f}'.format(metrics.r2_score(y_train, predik_train)))" ] }, { "cell_type": "markdown", "id": "0381e21e-d02f-41ae-8c3d-f7078b4a6ca7", "metadata": {}, "source": [ "
" ] }, { "cell_type": "markdown", "id": "12209067-2739-49cd-be1a-d1af06eeee0c", "metadata": {}, "source": [ "### Evaluasi di Test Set" ] }, { "cell_type": "code", "execution_count": 23, "id": "ddc0b4cd-e25b-4dec-974c-e3db25c17188", "metadata": {}, "outputs": [], "source": [ "predik_test = model_rfr.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 24, "id": "b237b634-557b-4d5a-a943-71c66ece358b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
price
033.079962
147.808692
222.113738
315.130963
444.625942
\n", "
" ], "text/plain": [ " price\n", "0 33.079962\n", "1 47.808692\n", "2 22.113738\n", "3 15.130963\n", "4 44.625942" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predik_test = pd.DataFrame(predik_test, columns=[target])\n", "predik_test.head()" ] }, { "cell_type": "code", "execution_count": 25, "id": "96aeb226-e964-4cdb-acf6-06cb1fff3efb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Prediksi X_testAsliEror
033.07996227.35.779962
147.80869254.46.591308
222.11373822.00.113738
315.13096311.63.530963
444.62594245.40.774058
............
7841.62609233.18.526092
7946.12585759.613.474143
8044.36255040.83.562550
8143.01888752.29.181113
8248.92699547.11.826995
\n", "

83 rows × 3 columns

\n", "
" ], "text/plain": [ " Prediksi X_test Asli Eror\n", "0 33.079962 27.3 5.779962\n", "1 47.808692 54.4 6.591308\n", "2 22.113738 22.0 0.113738\n", "3 15.130963 11.6 3.530963\n", "4 44.625942 45.4 0.774058\n", ".. ... ... ...\n", "78 41.626092 33.1 8.526092\n", "79 46.125857 59.6 13.474143\n", "80 44.362550 40.8 3.562550\n", "81 43.018887 52.2 9.181113\n", "82 48.926995 47.1 1.826995\n", "\n", "[83 rows x 3 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compareXX_test = pd.DataFrame(np.column_stack((predik_test, pd.DataFrame(y_test), abs(predik_test - pd.DataFrame(y_test)))), columns=['Prediksi X_test', 'Asli', 'Eror'])\n", "compareXX_test" ] }, { "cell_type": "code", "execution_count": 26, "id": "508c6985-e77c-45d1-a9e7-5b542681f6e0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Prediksi X_testAsliEror
count83.00000083.00000083.000000
mean37.11849135.0578315.380155
std11.00534112.3950434.084077
min12.27568611.6000000.113738
25%33.21170023.8000002.231576
50%40.50311436.9000004.200462
75%44.62594242.9000008.163311
max52.58815159.60000017.777806
\n", "
" ], "text/plain": [ " Prediksi X_test Asli Eror\n", "count 83.000000 83.000000 83.000000\n", "mean 37.118491 35.057831 5.380155\n", "std 11.005341 12.395043 4.084077\n", "min 12.275686 11.600000 0.113738\n", "25% 33.211700 23.800000 2.231576\n", "50% 40.503114 36.900000 4.200462\n", "75% 44.625942 42.900000 8.163311\n", "max 52.588151 59.600000 17.777806" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compareXX_test.describe()" ] }, { "cell_type": "markdown", "id": "3cb48f8a-8784-4fef-b19a-21823a72cd64", "metadata": {}, "source": [ "Nilai MAE (rata-rata kolom Eror) di test set adalah 5.38" ] }, { "cell_type": "code", "execution_count": 27, "id": "f629bbe5-de60-4189-9f84-4eebf39d3b86", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Visualisasi hasil Test Set\n", "plt.scatter(predik_test, y_test, color = 'red')\n", "plt.plot(predik_test, model_rfr.predict(X_test), color = 'blue')\n", "plt.title('Prediksi vs Aktual (Test Set)')\n", "plt.xlabel('Prediksi')\n", "plt.ylabel('Aktual')\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "25fdef67-50d9-46b9-a70a-b766c10c17d0", "metadata": {}, "source": [ "Dari visualisasi di atas, terlihat bahwa nilai prediksi test set dengan nilai asli hampir sama; hal itu ditandai dengan titik merah yang sangat dekat dengan garis biru. Namun ada juga titik yang sangat jauh dari garis, yang menandakan outlier." ] }, { "cell_type": "markdown", "id": "a34fcd1b-216c-4f40-bd4e-fc0013dd9014", "metadata": {}, "source": [ "
" ] }, { "cell_type": "markdown", "id": "cf899d16-3aac-4766-8fa6-12dea1f7bdb8", "metadata": {}, "source": [ "Menghitung Metric hasil dari test set" ] }, { "cell_type": "code", "execution_count": 28, "id": "7d5ad535-20c1-48f7-b877-043ac653b6e5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MAE = 5.38\n", "MSE = 45.42\n", "RMSE = 6.74\n", "R2 = 0.70\n" ] } ], "source": [ "# Menghitung nilai error (MAE, MSE, RMSE) dan R2 di Test Set\n", "print('MAE = {0:.2f}'.format(metrics.mean_absolute_error(y_test, predik_test))) # MAE adalah nilai error rata-rata seperti yang ada di tabel compareXX_test\n", "print('MSE = {0:.2f}'.format(metrics.mean_squared_error(y_test, predik_test))) # penulisan {0:.2f} adalah untuk menuliskan 2 angka di belakang koma\n", "print('RMSE = {0:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, predik_test))))\n", "print('R2 = {0:.2f}'.format(metrics.r2_score(y_test, predik_test)))" ] }, { "cell_type": "markdown", "id": "25442c62-caa1-4f10-aa26-eb9723cba1a8", "metadata": {}, "source": [ "Bisa dilihat bahwa nilai R2 pada training set dan test set masing-masing 0.54 dan 0.70, masih rendah (R2 training hanya sedikit di atas 50%). Dari sini peneliti masih kurang puas dengan hasil; untuk itu peneliti akan mencoba lagi dengan pemilihan model terbaik, cross validation, dan hyperparameter tuning." ] }, { "cell_type": "markdown", "id": "32886b77-425d-49f4-b99d-5eb9bd6ba84b", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 29, "id": "c0ee7d9f-5b7f-4ad6-90e7-85dda37779ca", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((331, 5), (83, 5))" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " df.drop(['No', 'Date', 'price'], axis=1),\n", " df['price'],\n", " test_size= 0.2,\n", " random_state= 1)\n", "\n", "for i in [X_train, X_test, y_train, y_test]:\n", " i.reset_index(drop=True, inplace=True)\n", "\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 30, "id": "436eb2b1-2ba3-4206-b26a-14e8472d9bd9", "metadata": {}, "outputs": [], "source": [ "preprocessor_numerik = Pipeline([\n", " ('imputasi', SimpleImputer(strategy='mean')),\n", " ('scaling', StandardScaler())\n", "])" ] }, { "cell_type": "markdown", "id": "71dfb628-a6ca-43b0-9687-b62920425850", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 31, "id": "025bfdda-5353-4b4d-8ff9-f7e9f179733f", "metadata": {}, "outputs": [], "source": [ "# Model regresi Linear\n", "linreg = LinearRegression()\n", "\n", "# Pipeline model regresi linear\n", "mod_linreg = Pipeline([\n", " ('preprocessor_numerik', preprocessor_numerik),\n", " ('linear', linreg)\n", "])\n", "\n", "# Hyperparameter tuning Linear Regression\n", "param_linreg = {}" ] }, { "cell_type": "markdown", "id": "a24143e5-8387-416f-b21e-28acd4154f42", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 32, "id": "3cd0ae91-0c65-453e-be0a-3855023d647d", "metadata": {}, "outputs": [], "source": [ "# Lasso regression model\n", "lasso = Lasso(alpha=0.001, random_state=0)\n", "\n", "# Lasso pipeline; the preprocessing step uses the same name as in the other model pipelines\n", "mod_lasso = Pipeline([\n", "    ('preprocessor_numerik', preprocessor_numerik),\n", "    ('lasso', lasso)\n", "])\n", "\n", "# Hyperparameter grid for Lasso (keys address the 'lasso' step of the pipeline)\n", "param_lasso = {'lasso__alpha': np.arange(0.01, 1.0, 0.01)}" ] }, { "cell_type": "markdown", "id": "3eea9575-a218-469e-a9ec-0bf922e5d3ea", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 33, "id": "432dde8e-acd9-4478-b5dd-f31358ea367a", "metadata": {}, "outputs": [], "source": [ "# Model regresi Ridge\n", "ridge = Ridge(alpha=0.5)\n", "\n", "# Pipeline model regresi Ridge\n", "mod_ridge = Pipeline([\n", " ('preprocessor_numerik', preprocessor_numerik),\n", " ('ridge', ridge)\n", "])\n", "\n", "# Hyperparameter tuning regresi Ridge\n", "param_ridge = {'ridge__alpha': np.arange(0.01, 1.0, 0.01),\n", " 'ridge__solver': ['auto','svd', 'lsqr']\n", " }" ] }, { "cell_type": "markdown", "id": "7f552dff-18b3-43ab-98f3-9bcf5c7f97d3", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 34, "id": "d0f2e3fa-299a-439e-a2a1-70e66635ebde", "metadata": {}, "outputs": [], "source": [ "# Model regresi ElasticNet\n", "enet = ElasticNet(random_state=0)\n", "\n", "# Pipeline model regresi Elastic Net\n", "mod_enet = Pipeline([\n", " ('preprocessor_numerik', preprocessor_numerik),\n", " ('enet', enet)\n", "])\n", "\n", "# Hyperparameter tuning Elastic Net\n", "param_enet = {\n", " 'enet__alpha': [0.01, 0.1, 0.5],\n", " 'enet__l1_ratio': [0.01, 0.1, 0.5, 0.9, 1]\n", " }" ] }, { "cell_type": "markdown", "id": "77de2a4e-c787-40c8-9bf9-fc7aa25e2eaf", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 35, "id": "0ae77193-f6c9-4ae3-89d2-3b7be253286c", "metadata": {}, "outputs": [], "source": [ "# Model regresi Decision Tree\n", "dt = DecisionTreeRegressor(random_state=0)\n", "\n", "# Pipeline model regresi Decision Tree\n", "mod_dt = Pipeline([\n", " ('preprocessor_numerik', preprocessor_numerik),\n", " ('dt', dt)\n", "])\n", "\n", "# Hyperparameter tuning regresi Decision Tree\n", "param_dt = {\n", " 'dt__splitter': ['best','random'],\n", " 'dt__max_depth': np.arange(1,10), \n", " 'dt__min_samples_split': np.arange(2,10),\n", " 'dt__min_samples_leaf': np.arange(1,5)\n", " }" ] }, { "cell_type": "markdown", "id": "5bdee382-37ee-427e-9cc2-400f3d5596d6", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 36, "id": "c65a4a44-ea0b-41e8-a997-d6c3762ce9b0", "metadata": {}, "outputs": [], "source": [ "# Model regresi Random Forest\n", "rf = RandomForestRegressor(random_state=0)\n", "\n", "# Pipeline model regresi Random Forest\n", "mod_rf = Pipeline([\n", " ('preprocessor_numerik', preprocessor_numerik),\n", " ('rf', rf)\n", "])\n", "\n", "# Hyperparameter tuning regresi Random Forest\n", "param_rf = {\n", " 'rf__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'], \n", " 'rf__min_samples_split': np.arange(2,10),\n", " 'rf__min_samples_leaf': np.arange(1,5)\n", " }" ] }, { "cell_type": "markdown", "id": "66f38cd9-0130-4dd9-8ed9-f0cbcf235100", "metadata": {}, "source": [ "
" ] }, { "cell_type": "code", "execution_count": 37, "id": "c89cecb6-bab3-4c31-a0c3-e5f622c1abaa", "metadata": {}, "outputs": [], "source": [ "# Model regresi Gradient Boosting\n", "gboost = GradientBoostingRegressor(loss='huber',criterion='squared_error',random_state=0)\n", "\n", "# Pipeline model regresi Gradient Boosting\n", "mod_gboost = Pipeline([\n", " ('preprocessor_numerik', preprocessor_numerik),\n", " ('gboost', gboost)\n", " ])\n", "\n", "# Hyperparameter tuning Gradient Boosting\n", "param_gboost = {\n", " 'gboost__n_estimators':[200, 250, 350, 400], #default 100\n", " 'gboost__learning_rate':[0.75, 0.1, 1.25], #default 0.1\n", " 'gboost__max_depth':[2], #default 3\n", " 'gboost__min_samples_split':[5, 6, 7, 8], #default 2\n", " 'gboost__min_samples_leaf':[1, 2, 3] # default 1\n", " }" ] }, { "cell_type": "markdown", "id": "c50dc4f0-af06-41d2-9011-c50bb6065293", "metadata": {}, "source": [ "Sekarang kita akan mentraining semua model di atas dan mengevaluasinya langsung menggunakan 3-fold cross validation." 
] }, { "cell_type": "code", "execution_count": 38, "id": "ef048a5b-82b4-423f-a2f9-1a276ba062a5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 3.3s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 1.5s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.0s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.6s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.4s finished\n", 
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.4s finished\n", "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 0.3s finished\n" ] } ], "source": [ "# Model training dengan cross validation\n", "daftar_model = [mod_linreg, mod_lasso, mod_ridge, mod_enet, mod_dt, mod_rf, mod_gboost]\n", "daftar_nama_model = ['linreg', 'lasso', 'ridge', 'elastic', 'dt', 'rf', 'gboost']\n", "mean_mae = []\n", "mean_r2 = []\n", "std_mae = []\n", "std_r2 = []\n", "test_score_mae = []\n", "test_score_r2 = []\n", "\n", "# Setting parameter jika permasalahannya adalah klasifikasi\n", "# skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)\n", "# tentukan cv = skf jika ingin stratified cross validation\n", "\n", "# Looping untuk setiap model yang sudah disiapkan \n", "for i in daftar_model:\n", " \n", " # Melakukan cross validation dan menggunakan kriteria berdasarkan skor MAE dan R2\n", " cv_mae = -cross_val_score(i, X_train, y_train, cv=3, scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1) # kita kalikan dengan -1 karena scoring menggunakan nilai negatif\n", " cv_r2 = cross_val_score(i, X_train, y_train, cv=3, scoring='r2', verbose=1, n_jobs=-1) # n_jobs=-1 artinya kita gunakan semua prerocessor\n", " \n", " # Menghitung nilai rata-rata MAE dan R2 dan menambahkannya ke variabel mean_mae dan mean_r2\n", " mean_mae.append(round(cv_mae.mean(),2)) # round(nilai,2) untuk membulatkan nilai 2 angka di belakang koma\n", " mean_r2.append(round(cv_r2.mean(),2))\n", " \n", " # Menghitung nilai standar deviasi MAE dan R2 dan menambahkannya ke variabel std_mae dan std_r2\n", " std_mae.append(round(cv_mae.std(),2))\n", " std_r2.append(round(cv_r2.std(),2))\n", " \n", " # Melakukan fitting training set kemudian melakukan prediksi di test set\n", " i.fit(X_train, y_train)\n", " i_predict = i.predict(X_test)\n", " \n", " 
# Menghitung nilai MAE dan R2 di test set dan menambahkannya ke variabel test_score_mae dan test_score_r2\n", " test_score_mae.append(round(mean_absolute_error(y_test, i_predict),2))\n", " test_score_r2.append(round(r2_score(y_test, i_predict),2))" ] }, { "cell_type": "code", "execution_count": 39, "id": "a52c3f8e-f293-4600-b4b4-5d89bed282f2", "metadata": {}, "outputs": [], "source": [ "# Membuat DataFrame\n", "cv_mae = pd.DataFrame({'model':daftar_nama_model, 'Train_Mean':mean_mae, 'std':std_mae, 'Test_Score':test_score_mae})\n", "cv_r2 = pd.DataFrame({'model':daftar_nama_model, 'Train_Mean':mean_r2, 'std':std_r2, 'Test_Score':test_score_r2})" ] }, { "cell_type": "code", "execution_count": 40, "id": "475df902-1956-44f1-815f-323339ce244d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelTrain_MeanstdTest_Score
0linreg6.850.885.38
1lasso6.850.885.38
2ridge6.850.885.38
3elastic7.191.115.80
4dt6.521.115.60
5rf5.070.575.37
6gboost5.020.585.25
\n", "
" ], "text/plain": [ " model Train_Mean std Test_Score\n", "0 linreg 6.85 0.88 5.38\n", "1 lasso 6.85 0.88 5.38\n", "2 ridge 6.85 0.88 5.38\n", "3 elastic 7.19 1.11 5.80\n", "4 dt 6.52 1.11 5.60\n", "5 rf 5.07 0.57 5.37\n", "6 gboost 5.02 0.58 5.25" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_mae" ] }, { "cell_type": "code", "execution_count": 41, "id": "ee2e5334-b984-4637-a273-765b7ee64136", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelTrain_MeanstdTest_Score
0gboost5.020.585.25
1rf5.070.575.37
2dt6.521.115.60
3linreg6.850.885.38
4lasso6.850.885.38
5ridge6.850.885.38
6elastic7.191.115.80
\n", "
" ], "text/plain": [ " model Train_Mean std Test_Score\n", "0 gboost 5.02 0.58 5.25\n", "1 rf 5.07 0.57 5.37\n", "2 dt 6.52 1.11 5.60\n", "3 linreg 6.85 0.88 5.38\n", "4 lasso 6.85 0.88 5.38\n", "5 ridge 6.85 0.88 5.38\n", "6 elastic 7.19 1.11 5.80" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mengurutkan skor dari kecil ke besar\n", "cv_mae_urut = cv_mae.sort_values(by=['Train_Mean','Test_Score'], ignore_index=True)\n", "cv_mae_urut" ] }, { "cell_type": "code", "execution_count": 42, "id": "37be7061-cc95-4abd-aa30-90fbb8bdf958", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelTrain_MeanstdTest_Score
0linreg0.490.110.70
1lasso0.490.110.70
2ridge0.490.110.70
3elastic0.470.120.66
4dt0.380.240.57
5rf0.670.060.61
6gboost0.660.060.65
\n", "
" ], "text/plain": [ " model Train_Mean std Test_Score\n", "0 linreg 0.49 0.11 0.70\n", "1 lasso 0.49 0.11 0.70\n", "2 ridge 0.49 0.11 0.70\n", "3 elastic 0.47 0.12 0.66\n", "4 dt 0.38 0.24 0.57\n", "5 rf 0.67 0.06 0.61\n", "6 gboost 0.66 0.06 0.65" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv_r2" ] }, { "cell_type": "code", "execution_count": 43, "id": "83e3e6a0-ae39-44df-bb64-43826c083f44", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelTrain_MeanstdTest_Score
0rf0.670.060.61
1gboost0.660.060.65
2linreg0.490.110.70
3lasso0.490.110.70
4ridge0.490.110.70
5elastic0.470.120.66
6dt0.380.240.57
\n", "
" ], "text/plain": [ " model Train_Mean std Test_Score\n", "0 rf 0.67 0.06 0.61\n", "1 gboost 0.66 0.06 0.65\n", "2 linreg 0.49 0.11 0.70\n", "3 lasso 0.49 0.11 0.70\n", "4 ridge 0.49 0.11 0.70\n", "5 elastic 0.47 0.12 0.66\n", "6 dt 0.38 0.24 0.57" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mengurutkan skor dari besar ke kecil\n", "cv_r2_urut = cv_r2.sort_values(by=['Train_Mean','Test_Score'], ascending=False, ignore_index=True)\n", "cv_r2_urut" ] }, { "cell_type": "markdown", "id": "075b952b-8732-4d3f-9ee9-a76cf22434ef", "metadata": {}, "source": [ "Bisa dilihat pada skor di atas bahwa model terbaik berdasarkan rata-rata MAE terendah adalah gradient boosting, sedangkan R2 tertinggi dicapai oleh random forest.\n", "\n", "Selain itu, performa di *test set* lebih baik daripada rata-rata *cross validation* di *training set* untuk model linear dan decision tree, sedangkan untuk random forest dan gradient boosting sedikit lebih rendah. Selisihnya kecil, sehingga tidak terlihat tanda *overfitting* yang berat." ] }, { "cell_type": "markdown", "id": "930bdddc-41d3-49d9-acea-1d855832af53", "metadata": {}, "source": [ "
" ] }, { "cell_type": "markdown", "id": "e1846257-b1d8-4b7c-bd8d-0912866a102c", "metadata": {}, "source": [ "untuk lebih lebih optimal menggunakan hyper parameter tuning" ] }, { "cell_type": "code", "execution_count": 44, "id": "1457ece4-7575-4b32-8da5-c3d41f585c83", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 1 candidates, totalling 5 fits\n", "Fitting 5 folds for each of 99 candidates, totalling 495 fits\n", "Fitting 5 folds for each of 297 candidates, totalling 1485 fits\n", "Fitting 5 folds for each of 15 candidates, totalling 75 fits\n", "Fitting 5 folds for each of 576 candidates, totalling 2880 fits\n", "Fitting 5 folds for each of 128 candidates, totalling 640 fits\n", "Fitting 5 folds for each of 144 candidates, totalling 720 fits\n" ] } ], "source": [ "# Model training dengan GridSearchCV\n", "daftar_model = [mod_linreg, mod_lasso, mod_ridge, mod_enet, mod_dt, mod_rf, mod_gboost]\n", "daftar_nama_model = ['linreg', 'lasso', 'ridge', 'elastic', 'dt', 'rf', 'gboost']\n", "daftar_param_model = [param_linreg, param_lasso, param_ridge, param_enet, param_dt, \n", " param_rf, param_gboost]\n", "\n", "# Nilai yang akan diisikan\n", "mae_tuning = []\n", "mae_tuning_test = []\n", "r2_tuning = []\n", "r2_tuning_test = []\n", "best_param = []\n", "best_estimator = []\n", "\n", "for i in range(len(daftar_model)):\n", " \n", " # Menjalankan GridSearchCV\n", " model_grid_cv = GridSearchCV(\n", " daftar_model[i],\n", " daftar_param_model[i],\n", " cv=5,\n", " verbose=1,\n", " n_jobs=-1\n", " ).fit(X_train,y_train)\n", " \n", " # Mencoba memprediksi training dan test set setelah fitting di training set, kemudian dikemas dalam format DataFrame\n", " pred_train = pd.DataFrame(model_grid_cv.predict(X_train), columns=[target])\n", " pred_test = pd.DataFrame(model_grid_cv.predict(X_test), columns=[target])\n", "\n", " # Mencatat skor MAE training dan test set\n", " 
mae_tuning.append(mean_absolute_error(y_train, pred_train))\n", " mae_tuning_test.append(mean_absolute_error(y_test, pred_test))\n", "\n", " # Mencatat skor R2 training dan test set\n", " r2_tuning.append(r2_score(y_train, pred_train))\n", " r2_tuning_test.append(r2_score(y_test, pred_test)) \n", " \n", " # Mencatat parameter terbaik di setiap model\n", " best_param.append(model_grid_cv.best_params_)\n", " \n", " # Merekam settingan modelnya\n", " best_estimator.append(model_grid_cv.best_estimator_)" ] }, { "cell_type": "code", "execution_count": 45, "id": "6215f70f-dc2a-438d-bf6c-c1e8e67e9e51", "metadata": {}, "outputs": [], "source": [ "# Membuat DataFrame sekaligus kita urutkan\n", "grid_mae = pd.DataFrame({'model':daftar_nama_model, 'Training':mae_tuning, 'Testing':mae_tuning_test})\n", "grid_mae_urut = grid_mae.sort_values(by='Testing', ignore_index=True)\n", "\n", "grid_r2 = pd.DataFrame({'model':daftar_nama_model, 'Training':r2_tuning, 'Testing':r2_tuning_test})\n", "grid_r2_urut = grid_r2.sort_values(by='Testing', ascending=False, ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 46, "id": "f4b8c07c-53b2-4333-ad11-1f1d58e95f3d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelTrainingTesting
0rf3.1731574.840727
1dt4.6599485.122358
2gboost3.0008845.129114
3elastic6.4958305.367278
4lasso6.5048375.377494
5ridge6.4697635.378675
6linreg6.4686975.380155
\n", "
" ], "text/plain": [ " model Training Testing\n", "0 rf 3.173157 4.840727\n", "1 dt 4.659948 5.122358\n", "2 gboost 3.000884 5.129114\n", "3 elastic 6.495830 5.367278\n", "4 lasso 6.504837 5.377494\n", "5 ridge 6.469763 5.378675\n", "6 linreg 6.468697 5.380155" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Melihat performa tuning berdasarkan MAE\n", "grid_mae_urut" ] }, { "cell_type": "code", "execution_count": 47, "id": "640bdfd5-b3a2-4ae9-96b5-dae1e2fecf61", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelTrainingTesting
0rf0.8526860.725545
1elastic0.5359940.704105
2lasso0.5360110.702151
3ridge0.5372120.701203
4linreg0.5372180.700731
5gboost0.8294080.700595
6dt0.7236940.688717
\n", "
" ], "text/plain": [ " model Training Testing\n", "0 rf 0.852686 0.725545\n", "1 elastic 0.535994 0.704105\n", "2 lasso 0.536011 0.702151\n", "3 ridge 0.537212 0.701203\n", "4 linreg 0.537218 0.700731\n", "5 gboost 0.829408 0.700595\n", "6 dt 0.723694 0.688717" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Melihat performa tuning berdasarkan R2\n", "grid_r2_urut" ] }, { "cell_type": "markdown", "id": "aa17dd53-86b2-4044-931f-d0b4076c1239", "metadata": {}, "source": [ "Dari hasil di atas dapat kita lihat bahwa model terbaik adalah random forest, yang menempati urutan 1 pada nilai R2 dan MAE di *test set*.\n", "\n", "Sekarang kita bisa menyimpan parameter dan estimator dari setiap model dalam format DataFrame." ] }, { "cell_type": "code", "execution_count": 48, "id": "623db2d2-7146-417a-a8f3-dd9c590bbe8d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelParam
0linreg{}
1lasso{'lasso__alpha': 0.23}
2ridge{'ridge__alpha': 0.99, 'ridge__solver': 'svd'}
3elastic{'enet__alpha': 0.1, 'enet__l1_ratio': 0.5}
4dt{'dt__max_depth': 7, 'dt__min_samples_leaf': 2...
5rf{'rf__criterion': 'squared_error', 'rf__min_sa...
6gboost{'gboost__learning_rate': 0.1, 'gboost__max_de...
\n", "
" ], "text/plain": [ " model Param\n", "0 linreg {}\n", "1 lasso {'lasso__alpha': 0.23}\n", "2 ridge {'ridge__alpha': 0.99, 'ridge__solver': 'svd'}\n", "3 elastic {'enet__alpha': 0.1, 'enet__l1_ratio': 0.5}\n", "4 dt {'dt__max_depth': 7, 'dt__min_samples_leaf': 2...\n", "5 rf {'rf__criterion': 'squared_error', 'rf__min_sa...\n", "6 gboost {'gboost__learning_rate': 0.1, 'gboost__max_de..." ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "level_parameter = pd.DataFrame({'model': daftar_nama_model, 'Param' :best_param})\n", "level_parameter" ] }, { "cell_type": "code", "execution_count": 49, "id": "285e6eaa-5dcb-4aa0-9bd7-644c20028d31", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelParam
0linreg((SimpleImputer(), StandardScaler()), LinearRe...
1lasso((SimpleImputer(), StandardScaler()), Lasso(al...
2ridge((SimpleImputer(), StandardScaler()), Ridge(al...
3elastic((SimpleImputer(), StandardScaler()), ElasticN...
4dt((SimpleImputer(), StandardScaler()), Decision...
5rf((SimpleImputer(), StandardScaler()), (Decisio...
6gboost((SimpleImputer(), StandardScaler()), ([Decisi...
\n", "
" ], "text/plain": [ " model Param\n", "0 linreg ((SimpleImputer(), StandardScaler()), LinearRe...\n", "1 lasso ((SimpleImputer(), StandardScaler()), Lasso(al...\n", "2 ridge ((SimpleImputer(), StandardScaler()), Ridge(al...\n", "3 elastic ((SimpleImputer(), StandardScaler()), ElasticN...\n", "4 dt ((SimpleImputer(), StandardScaler()), Decision...\n", "5 rf ((SimpleImputer(), StandardScaler()), (Decisio...\n", "6 gboost ((SimpleImputer(), StandardScaler()), ([Decisi..." ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Estimator terbaik dari setiap model\n", "level_estimator = pd.DataFrame({'model':daftar_nama_model, 'Param':best_estimator})\n", "level_estimator" ] }, { "cell_type": "code", "execution_count": 50, "id": "9c04f669-d8e2-4c41-aca3-f0546c9bd22a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'rf__criterion': 'squared_error',\n", " 'rf__min_samples_leaf': 3,\n", " 'rf__min_samples_split': 2}" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Parameter terbaik Random Forest Regression (indeks 5, yaitu model keenam di daftar_nama_model)\n", "level_parameter['Param'][5]" ] }, { "cell_type": "code", "execution_count": 51, "id": "fbf9120e-39a8-4e26-9119-ac9f8568c824", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('preprocessor_numerik',\n",
       "                 Pipeline(steps=[('imputasi', SimpleImputer()),\n",
       "                                 ('scaling', StandardScaler())])),\n",
       "                ('rf',\n",
       "                 RandomForestRegressor(min_samples_leaf=3, random_state=0))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('preprocessor_numerik',\n", " Pipeline(steps=[('imputasi', SimpleImputer()),\n", " ('scaling', StandardScaler())])),\n", " ('rf',\n", " RandomForestRegressor(min_samples_leaf=3, random_state=0))])" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "level_estimator['Param'][5]" ] }, { "cell_type": "code", "execution_count": 52, "id": "cd6d52c9-b224-4f25-ad91-e7ac9c924fa5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelParamTesting
0rf{'rf__criterion': 'squared_error', 'rf__min_sa...0.725545
1elastic{'enet__alpha': 0.1, 'enet__l1_ratio': 0.5}0.704105
2lasso{'lasso__alpha': 0.23}0.702151
3ridge{'ridge__alpha': 0.99, 'ridge__solver': 'svd'}0.701203
4linreg{}0.700731
5gboost{'gboost__learning_rate': 0.1, 'gboost__max_de...0.700595
6dt{'dt__max_depth': 7, 'dt__min_samples_leaf': 2...0.688717
\n", "
" ], "text/plain": [ " model Param Testing\n", "0 rf {'rf__criterion': 'squared_error', 'rf__min_sa... 0.725545\n", "1 elastic {'enet__alpha': 0.1, 'enet__l1_ratio': 0.5} 0.704105\n", "2 lasso {'lasso__alpha': 0.23} 0.702151\n", "3 ridge {'ridge__alpha': 0.99, 'ridge__solver': 'svd'} 0.701203\n", "4 linreg {} 0.700731\n", "5 gboost {'gboost__learning_rate': 0.1, 'gboost__max_de... 0.700595\n", "6 dt {'dt__max_depth': 7, 'dt__min_samples_leaf': 2... 0.688717" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Kita gabungkan semua dan urutkan berdasarkan nilai R2 di test set sebagai 'model_best_param'\n", "model_best_param = pd.DataFrame({'model':daftar_nama_model, 'Param':best_param, 'Testing':r2_tuning_test})\n", "model_best_param = model_best_param.sort_values(by='Testing', ascending=False, ignore_index=True)\n", "model_best_param" ] }, { "cell_type": "code", "execution_count": 53, "id": "4f05ade9-0beb-40ba-a14f-aff9fe9432dc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelParamTesting
0rf((SimpleImputer(), StandardScaler()), (Decisio...0.725545
1elastic((SimpleImputer(), StandardScaler()), ElasticN...0.704105
2lasso((SimpleImputer(), StandardScaler()), Lasso(al...0.702151
3ridge((SimpleImputer(), StandardScaler()), Ridge(al...0.701203
4linreg((SimpleImputer(), StandardScaler()), LinearRe...0.700731
5gboost((SimpleImputer(), StandardScaler()), ([Decisi...0.700595
6dt((SimpleImputer(), StandardScaler()), Decision...0.688717
\n", "
" ], "text/plain": [ " model Param Testing\n", "0 rf ((SimpleImputer(), StandardScaler()), (Decisio... 0.725545\n", "1 elastic ((SimpleImputer(), StandardScaler()), ElasticN... 0.704105\n", "2 lasso ((SimpleImputer(), StandardScaler()), Lasso(al... 0.702151\n", "3 ridge ((SimpleImputer(), StandardScaler()), Ridge(al... 0.701203\n", "4 linreg ((SimpleImputer(), StandardScaler()), LinearRe... 0.700731\n", "5 gboost ((SimpleImputer(), StandardScaler()), ([Decisi... 0.700595\n", "6 dt ((SimpleImputer(), StandardScaler()), Decision... 0.688717" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# menggabungkan semua dan urutkan berdasarkan nilai R2 di test set sebagai 'model_best_estimator'\n", "model_best_estimator = pd.DataFrame({'model':daftar_nama_model, 'Param':best_estimator, 'Testing':r2_tuning_test})\n", "model_best_estimator = model_best_estimator.sort_values(by='Testing', ascending=False, ignore_index=True)\n", "model_best_estimator" ] }, { "cell_type": "markdown", "id": "cb4b8a22-3093-4c9c-8517-6975fe7f3fd4", "metadata": {}, "source": [ "
" ] }, { "cell_type": "markdown", "id": "d1a67760-4840-4e84-8fc0-8df37648e63d", "metadata": {}, "source": [ "## Menggunakan Model Terbaik" ] }, { "cell_type": "markdown", "id": "44678a08-51df-41bb-991f-e912f138fff1", "metadata": {}, "source": [ "Melatih ulang model terbaik (beserta parameter terbaiknya) dengan dataset penuh." ] }, { "cell_type": "markdown", "id": "bb5bdfd0-e7a9-4194-8dcb-0e97ace61ad6", "metadata": {}, "source": [ "### Memanggil data awal proses tadi" ] }, { "cell_type": "code", "execution_count": 54, "id": "3d58c279-75ad-4497-bf0d-2b6d7aec0514", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NoDateAgeDistance_MRTTotal_SotresLatitudelongitudeprice
012012.91666732.084.878821024.98298121.5402437.9
122012.91666719.5306.59470924.98034121.5395142.2
232013.58333313.3561.98450524.98746121.5439147.3
342013.50000013.3561.98450524.98746121.5439154.8
452012.8333335.0390.56840524.97937121.5424543.1
\n", "
" ], "text/plain": [ " No Date Age Distance_MRT Total_Sotres Latitude longitude \\\n", "0 1 2012.916667 32.0 84.87882 10 24.98298 121.54024 \n", "1 2 2012.916667 19.5 306.59470 9 24.98034 121.53951 \n", "2 3 2013.583333 13.3 561.98450 5 24.98746 121.54391 \n", "3 4 2013.500000 13.3 561.98450 5 24.98746 121.54391 \n", "4 5 2012.833333 5.0 390.56840 5 24.97937 121.54245 \n", "\n", " price \n", "0 37.9 \n", "1 42.2 \n", "2 47.3 \n", "3 54.8 \n", "4 43.1 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 55, "id": "8bb4ca42-78d8-43a7-a090-00574bf24483", "metadata": {}, "outputs": [], "source": [ "# Kita bagi variabel independen dan dependen-nya\n", "X = df.iloc[:,2:-1]\n", "y = df.iloc[:,-1]" ] }, { "cell_type": "code", "execution_count": 56, "id": "a187a519-df74-41f0-943e-3d609b1148de", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeDistance_MRTTotal_SotresLatitudelongitude
032.084.878821024.98298121.54024
119.5306.59470924.98034121.53951
213.3561.98450524.98746121.54391
313.3561.98450524.98746121.54391
45.0390.56840524.97937121.54245
\n", "
" ], "text/plain": [ " Age Distance_MRT Total_Sotres Latitude longitude\n", "0 32.0 84.87882 10 24.98298 121.54024\n", "1 19.5 306.59470 9 24.98034 121.53951\n", "2 13.3 561.98450 5 24.98746 121.54391\n", "3 13.3 561.98450 5 24.98746 121.54391\n", "4 5.0 390.56840 5 24.97937 121.54245" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.head()" ] }, { "cell_type": "code", "execution_count": 57, "id": "0b93dd3a-e435-4561-96ea-291d3eec90b6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 37.9\n", "1 42.2\n", "2 47.3\n", "3 54.8\n", "4 43.1\n", "Name: price, dtype: float64" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y.head()" ] }, { "cell_type": "markdown", "id": "e0625540-1ce4-4cf2-a880-6c541fcddeac", "metadata": {}, "source": [ "
" ] }, { "cell_type": "markdown", "id": "6b3745e4-1ac5-4339-8a11-f9db9244f941", "metadata": {}, "source": [ "### Training Ulang Model Akhir Dengan Dataset Utuh" ] }, { "cell_type": "code", "execution_count": 58, "id": "529af8c3-1c3e-4d3a-acb7-1c6c7810a7a8", "metadata": {}, "outputs": [], "source": [ "model_akhir_otomastis = model_best_estimator['Param'][0].fit(X,y)" ] }, { "cell_type": "code", "execution_count": 59, "id": "de6d6f87-7666-4f85-bc5c-6073ded5250f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('preprocessor_numerik',\n",
       "                 Pipeline(steps=[('imputasi', SimpleImputer()),\n",
       "                                 ('scaling', StandardScaler())])),\n",
       "                ('rf',\n",
       "                 RandomForestRegressor(min_samples_leaf=3, random_state=0))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('preprocessor_numerik',\n", " Pipeline(steps=[('imputasi', SimpleImputer()),\n", " ('scaling', StandardScaler())])),\n", " ('rf',\n", " RandomForestRegressor(min_samples_leaf=3, random_state=0))])" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_akhir_otomastis" ] }, { "cell_type": "code", "execution_count": 60, "id": "708690c5-3451-42a1-953c-83ec8a8da42a", "metadata": {}, "outputs": [], "source": [ "# Memprediksi training dan test set dengan model yang sudah dilatih ulang di dataset utuh (X, y), kemudian dikemas dalam format DataFrame\n", "# Catatan: bila X_test merupakan bagian dari X, prediksi di test set ini bukan lagi estimasi performa pada data baru\n", "pred_train_otomatis = pd.DataFrame(model_akhir_otomastis.predict(X_train), columns=[target])\n", "pred_test_otomatis = pd.DataFrame(model_akhir_otomastis.predict(X_test), columns=[target])" ] }, { "cell_type": "code", "execution_count": 61, "id": "35bce141-56eb-4e10-8702-cb7845b3fd5a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PrediksiAsliError
037.95695537.40.556955
150.38757151.81.412429
258.55061858.10.450618
348.73164349.50.768357
428.89996330.61.700037
\n", "
" ], "text/plain": [ " Prediksi Asli Error\n", "0 37.956955 37.4 0.556955\n", "1 50.387571 51.8 1.412429\n", "2 58.550618 58.1 0.450618\n", "3 48.731643 49.5 0.768357\n", "4 28.899963 30.6 1.700037" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Membandingkan 2 numpy array (kita gunakan np.column_stack untuk menggabungkan kedua array ini)\n", "banding_train_otomatis = pd.DataFrame(np.column_stack((pred_train_otomatis, pd.DataFrame(y_train), abs(pred_train_otomatis - pd.DataFrame(y_train)))), columns=['Prediksi', 'Asli', 'Error'])\n", "banding_train_otomatis.head()" ] }, { "cell_type": "code", "execution_count": 62, "id": "ada4228b-1f1d-4bc9-a4d8-3da838797114", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PrediksiAsliError
count331.000000331.000000331.000000
mean38.54412738.7129913.134446
std12.09096013.8144144.203389
min14.6032337.6000000.001991
25%27.90955328.4500000.904570
50%39.58501939.3000002.021594
75%47.10655447.3000003.943572
max67.117270117.50000050.743411
\n", "
" ], "text/plain": [ " Prediksi Asli Error\n", "count 331.000000 331.000000 331.000000\n", "mean 38.544127 38.712991 3.134446\n", "std 12.090960 13.814414 4.203389\n", "min 14.603233 7.600000 0.001991\n", "25% 27.909553 28.450000 0.904570\n", "50% 39.585019 39.300000 2.021594\n", "75% 47.106554 47.300000 3.943572\n", "max 67.117270 117.500000 50.743411" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "banding_train_otomatis.describe()" ] }, { "cell_type": "code", "execution_count": 63, "id": "aa309118-7caf-4ec6-8677-9ffb2ec09d29", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PrediksiAsliError
025.36513327.31.934867
149.58813754.44.811863
223.53769622.01.537696
315.09149211.63.491492
453.59684845.48.196848
\n", "
" ], "text/plain": [ " Prediksi Asli Error\n", "0 25.365133 27.3 1.934867\n", "1 49.588137 54.4 4.811863\n", "2 23.537696 22.0 1.537696\n", "3 15.091492 11.6 3.491492\n", "4 53.596848 45.4 8.196848" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "banding_test_otomatis = pd.DataFrame(np.column_stack((pred_test_otomatis, pd.DataFrame(y_test), abs(pred_test_otomatis - pd.DataFrame(y_test)))), columns=['Prediksi', 'Asli', 'Error'])\n", "banding_test_otomatis.head()" ] }, { "cell_type": "code", "execution_count": 64, "id": "8ab31797-2d61-4732-80b7-d289f1c55215", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PrediksiAsliError
count83.00000083.00000083.000000
mean36.25075935.0578312.812076
std12.36558512.3950432.521332
min15.09149211.6000000.009281
25%26.50542923.8000000.901083
50%37.82587036.9000001.934867
75%45.36631742.9000004.188026
max59.66195659.60000010.897020
\n", "
" ], "text/plain": [ " Prediksi Asli Error\n", "count 83.000000 83.000000 83.000000\n", "mean 36.250759 35.057831 2.812076\n", "std 12.365585 12.395043 2.521332\n", "min 15.091492 11.600000 0.009281\n", "25% 26.505429 23.800000 0.901083\n", "50% 37.825870 36.900000 1.934867\n", "75% 45.366317 42.900000 4.188026\n", "max 59.661956 59.600000 10.897020" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "banding_test_otomatis.describe()" ] }, { "cell_type": "code", "execution_count": 65, "id": "d1e227b2-5b8c-4db1-8a31-70368be52a41", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Error train set 3.134\n", "Error test set 2.812\n" ] } ], "source": [ "# Melihat skor MAE setelah tuning\n", "print(f'Error train set {mean_absolute_error(y_train, pred_train_otomatis):.3f}')\n", "print(f'Error test set {mean_absolute_error(y_test, pred_test_otomatis):.3f}')" ] }, { "cell_type": "code", "execution_count": 66, "id": "98b3d4e8-d29f-4f4b-9fc5-e457c179cf92", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "R2 train set = 0.856\n", "R2 test set = 0.907\n" ] } ], "source": [ "# Melihat skor R2 setelah tuning\n", "print(f'R2 train set = {r2_score(y_train, pred_train_otomatis):.3f}')\n", "print(f'R2 test set = {r2_score(y_test, pred_test_otomatis):.3f}')" ] }, { "cell_type": "code", "execution_count": 67, "id": "3ee46963-7fdb-4103-9230-b11f48b07fb3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.856\n", "0.907\n" ] } ], "source": [ "# Melihat R2 dengan menggunakan method 'score'\n", "print(round(model_akhir_otomastis.score(X_train, y_train),3))\n", "print(round(model_akhir_otomastis.score(X_test, y_test),3))" ] }, { "cell_type": "markdown", "id": "105af1d2-b011-48a6-8f33-151a7aeb4470", "metadata": {}, "source": [ "
" ] }, { "cell_type": "markdown", "id": "1985fe84-7f01-4667-a947-707579a3b2a9", "metadata": {}, "source": [ "### Menyimpan Model" ] }, { "cell_type": "code", "execution_count": 68, "id": "409525c6-8d7d-4ab2-b56c-cf0547b92ea7", "metadata": {}, "outputs": [], "source": [ "import pickle" ] }, { "cell_type": "code", "execution_count": 69, "id": "f2090b0a-fb3f-47a4-bbdb-a9e90ddd56aa", "metadata": {}, "outputs": [], "source": [ "# Menyimpan model dengan nama 'model_regresi_terbaik.pkl'\n", "pickle.dump(model_akhir_otomastis, open('model_regresi_realestate.pkl', 'wb'))" ] }, { "cell_type": "code", "execution_count": 70, "id": "39d4f558-5b16-4065-984b-ccc8a214d3bd", "metadata": {}, "outputs": [], "source": [ "best_model = pickle.load(open('model_regresi_realestate.pkl', 'rb'))" ] }, { "cell_type": "code", "execution_count": 71, "id": "a4c9fd28-fa77-4d6d-b45b-0f7b2571a9a6", "metadata": {}, "outputs": [], "source": [ "prediksi = best_model.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 72, "id": "9a33eb60-b4e0-4c49-a786-04642e863f51", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([25.36513312, 49.58813707, 23.53769632, 15.09149192, 53.5968484 ,\n", " 26.62990923, 24.92314482, 33.47161876, 38.99839109, 18.41769968,\n", " 17.40928084, 37.14860224, 23.57370049, 44.16059913, 54.79656104,\n", " 22.36188506, 53.5968484 , 43.39913588, 27.41186374, 48.52032763,\n", " 46.1403558 , 23.97747107, 37.8747926 , 59.12870727, 51.22921429,\n", " 26.3939 , 29.4381557 , 51.67359258, 47.2277215 , 17.16195631,\n", " 30.20013785, 37.47691944, 41.48889206, 50.94102314, 46.70533535,\n", " 17.7374671 , 37.82586962, 37.58559887, 42.259326 , 17.1534004 ,\n", " 39.74715667, 16.25023175, 59.66195558, 30.57614654, 29.44505325,\n", " 26.61695768, 22.80101331, 42.93427837, 38.04899524, 37.82812385,\n", " 17.7374671 , 55.19701974, 43.05059957, 16.0955438 , 49.28513763,\n", " 40.71226944, 53.5968484 , 38.95569908, 36.69922615, 40.08712758,\n", " 15.79309127, 34.49000397, 
27.89192287, 17.7374671 , 41.94553939,\n", " 37.25059571, 51.28433846, 41.46778748, 16.12176154, 41.67784149,\n", " 48.8599261 , 16.93294358, 33.99375173, 29.11361454, 54.81626342,\n", " 36.84870188, 33.30433813, 38.91979075, 34.03963373, 51.47944844,\n", " 43.48584838, 49.82051014, 44.59227832])" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prediksi" ] }, { "cell_type": "code", "execution_count": 73, "id": "d4df069b-2893-4eaa-b938-9beae29d9a51", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
price
025.365133
149.588137
223.537696
315.091492
453.596848
......
7834.039634
7951.479448
8043.485848
8149.820510
8244.592278
\n", "

83 rows × 1 columns

\n", "
" ], "text/plain": [ " price\n", "0 25.365133\n", "1 49.588137\n", "2 23.537696\n", "3 15.091492\n", "4 53.596848\n", ".. ...\n", "78 34.039634\n", "79 51.479448\n", "80 43.485848\n", "81 49.820510\n", "82 44.592278\n", "\n", "[83 rows x 1 columns]" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prediksi = pd.DataFrame(prediksi, columns=[target])\n", "prediksi" ] }, { "cell_type": "code", "execution_count": null, "id": "ff09aa4d-26b5-4d04-b0f1-52adf1765a6c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }