diff --git "a/HuggingFace/h8dsft_P1G1_KeziaIntan.ipynb" "b/HuggingFace/h8dsft_P1G1_KeziaIntan.ipynb" new file mode 100644--- /dev/null +++ "b/HuggingFace/h8dsft_P1G1_KeziaIntan.ipynb" @@ -0,0 +1,2577 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GRADED CHALLENGE 1" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "### Identity\n", + "```txt\n", + "\n", + "KEZIA INTAN NATALIE\n", + "Batch 003" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Problem Statement\n", + "- Mampu memahami konsep regression dengan Linear Regression.\n", + "- Mampu mempersiapkan data untuk digunakan dalam model Linear Regression.\n", + "- Mampu mengimplementasikan Linear Regression untuk membuat prediksi." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Objective Statement\n", + "Buatlah model Regression menggunakan Linear Regression untuk memprediksi harga perjalanan platform ride-hailing dengan dataset yang disediakan." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Library" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "first of all import the library that we use" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name '_check_y' from 'sklearn.utils.validation' (c:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel_selection\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mfeature_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutliers\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mWinsorizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpreprocessing\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mStandardScaler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mMinMaxScaler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOneHotEncoder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOrdinalEncoder\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlinear_model\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mLinearRegression\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mLasso\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mRidge\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\feature_engine\\outliers\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \"\"\"\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0martbitrary\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mArbitraryOutlierCapper\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mtrimmer\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mOutlierTrimmer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mwinsorizer\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mWinsorizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\feature_engine\\outliers\\artbitrary.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[0m_find_or_check_numerical_variables\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m )\n\u001b[1;32m---> 26\u001b[1;33m from feature_engine.dataframe_checks import (\n\u001b[0m\u001b[0;32m 27\u001b[0m \u001b[0m_check_contains_inf\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[0m_check_contains_na\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\feature_engine\\dataframe_checks.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[1;32mas\u001b[0m 
\u001b[0mpd\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msparse\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0missparse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalidation\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_check_y\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mImportError\u001b[0m: cannot import name '_check_y' from 'sklearn.utils.validation' (c:\\Users\\LENOVO\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py)" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from feature_engine.outliers import Winsorizer\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder\n", + "from sklearn.linear_model import LinearRegression, Lasso, Ridge\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n", + "\n", + "import joblib\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(action='ignore')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading Data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "load the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#import dataset\n", + "df = pd.read_csv('rideshare_kaggle.csv')" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#making dataframe showing all columns\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamphourdaymonthdatetimetimezonesourcedestinationcab_typeproduct_idnamepricedistancesurge_multiplierlatitudelongitudetemperatureapparentTemperatureshort_summarylong_summaryprecipIntensityprecipProbabilityhumiditywindSpeedwindGustwindGustTimevisibilitytemperatureHightemperatureHighTimetemperatureLowtemperatureLowTimeapparentTemperatureHighapparentTemperatureHighTimeapparentTemperatureLowapparentTemperatureLowTimeicondewPointpressurewindBearingcloudCoveruvIndexvisibility.1ozonesunriseTimesunsetTimemoonPhaseprecipIntensityMaxuvIndexTimetemperatureMintemperatureMinTimetemperatureMaxtemperatureMaxTimeapparentTemperatureMinapparentTemperatureMinTimeapparentTemperatureMaxapparentTemperatureMaxTime
0424553bb-7174-41ea-aeb4-fe06d4f4b9d71.544953e+09916122018-12-16 09:30:07America/New_YorkHaymarket SquareNorth StationLyftlyft_lineShared5.00.441.042.2148-71.03342.3437.12Mostly CloudyRain throughout the day.0.00000.00.688.669.17154501560010.00043.68154496880034.19154504800037.95154496880027.391545044400partly-cloudy-night32.701021.98570.72010.000303.8154496208415449948640.300.1276154497960039.89154501200043.68154496880033.73154501200038.071544958000
14bd23055-6827-41c6-b23b-3c491f24e74d1.543284e+09227112018-11-27 02:00:23America/New_YorkHaymarket SquareNorth StationLyftlyft_premierLux11.00.441.042.2148-71.03343.5837.35RainRain until morning, starting again in the eve...0.12991.00.9411.9811.9815432912004.78647.30154325160042.10154329840043.92154325160036.201543291200rain41.831003.97901.0004.786291.1154323296915432669920.640.1300154325160040.49154323360047.30154325160036.20154329120043.921543251600
2981a3613-77af-4620-a42a-0c0866077d1e1.543367e+09128112018-11-28 01:00:22America/New_YorkHaymarket SquareNorth StationLyftlyftLyft7.00.441.042.2148-71.03338.3332.93ClearLight rain in the morning.0.00000.00.757.337.33154333440010.00047.55154332000033.10154340280044.12154332000029.111543392000clear-night31.10992.282400.03010.000315.7154331943715433533640.680.1064154333800035.36154337760047.55154332000031.04154337760044.121543320000
3c2d88af2-d278-4bfd-a8d0-29ca77cc55121.543554e+09430112018-11-30 04:53:02America/New_YorkHaymarket SquareNorth StationLyftlyft_luxsuvLux Black XL26.00.441.042.2148-71.03334.3829.63ClearPartly cloudy throughout the day.0.00000.00.735.285.28154351440010.00045.03154351080028.90154357920038.53154351080026.201543575600clear-night26.641013.733100.00010.000291.1154349237015435261140.750.0000154350720034.67154355040045.03154351080030.30154355040038.531543510800
4e0126e1f-8ca9-4f2e-82b3-50505a09db9a1.543463e+09329112018-11-29 03:49:20America/New_YorkHaymarket SquareNorth StationLyftlyft_plusLyft XL9.00.441.042.2148-71.03337.4430.88Partly CloudyMostly cloudy throughout the day.0.00000.00.709.149.14154344600010.00042.18154342080036.71154347840035.75154342080030.291543460400partly-cloudy-night28.61998.363030.44010.000347.7154340590415434397380.720.0001154342080033.10154340280042.18154342080029.11154339200035.751543420800
\n", + "
" + ], + "text/plain": [ + " id timestamp hour day month \\\n", + "0 424553bb-7174-41ea-aeb4-fe06d4f4b9d7 1.544953e+09 9 16 12 \n", + "1 4bd23055-6827-41c6-b23b-3c491f24e74d 1.543284e+09 2 27 11 \n", + "2 981a3613-77af-4620-a42a-0c0866077d1e 1.543367e+09 1 28 11 \n", + "3 c2d88af2-d278-4bfd-a8d0-29ca77cc5512 1.543554e+09 4 30 11 \n", + "4 e0126e1f-8ca9-4f2e-82b3-50505a09db9a 1.543463e+09 3 29 11 \n", + "\n", + " datetime timezone source destination \\\n", + "0 2018-12-16 09:30:07 America/New_York Haymarket Square North Station \n", + "1 2018-11-27 02:00:23 America/New_York Haymarket Square North Station \n", + "2 2018-11-28 01:00:22 America/New_York Haymarket Square North Station \n", + "3 2018-11-30 04:53:02 America/New_York Haymarket Square North Station \n", + "4 2018-11-29 03:49:20 America/New_York Haymarket Square North Station \n", + "\n", + " cab_type product_id name price distance surge_multiplier \\\n", + "0 Lyft lyft_line Shared 5.0 0.44 1.0 \n", + "1 Lyft lyft_premier Lux 11.0 0.44 1.0 \n", + "2 Lyft lyft Lyft 7.0 0.44 1.0 \n", + "3 Lyft lyft_luxsuv Lux Black XL 26.0 0.44 1.0 \n", + "4 Lyft lyft_plus Lyft XL 9.0 0.44 1.0 \n", + "\n", + " latitude longitude temperature apparentTemperature short_summary \\\n", + "0 42.2148 -71.033 42.34 37.12 Mostly Cloudy \n", + "1 42.2148 -71.033 43.58 37.35 Rain \n", + "2 42.2148 -71.033 38.33 32.93 Clear \n", + "3 42.2148 -71.033 34.38 29.63 Clear \n", + "4 42.2148 -71.033 37.44 30.88 Partly Cloudy \n", + "\n", + " long_summary precipIntensity \\\n", + "0 Rain throughout the day. 0.0000 \n", + "1 Rain until morning, starting again in the eve... 0.1299 \n", + "2 Light rain in the morning. 0.0000 \n", + "3 Partly cloudy throughout the day. 0.0000 \n", + "4 Mostly cloudy throughout the day. 
0.0000 \n", + "\n", + " precipProbability humidity windSpeed windGust windGustTime visibility \\\n", + "0 0.0 0.68 8.66 9.17 1545015600 10.000 \n", + "1 1.0 0.94 11.98 11.98 1543291200 4.786 \n", + "2 0.0 0.75 7.33 7.33 1543334400 10.000 \n", + "3 0.0 0.73 5.28 5.28 1543514400 10.000 \n", + "4 0.0 0.70 9.14 9.14 1543446000 10.000 \n", + "\n", + " temperatureHigh temperatureHighTime temperatureLow temperatureLowTime \\\n", + "0 43.68 1544968800 34.19 1545048000 \n", + "1 47.30 1543251600 42.10 1543298400 \n", + "2 47.55 1543320000 33.10 1543402800 \n", + "3 45.03 1543510800 28.90 1543579200 \n", + "4 42.18 1543420800 36.71 1543478400 \n", + "\n", + " apparentTemperatureHigh apparentTemperatureHighTime \\\n", + "0 37.95 1544968800 \n", + "1 43.92 1543251600 \n", + "2 44.12 1543320000 \n", + "3 38.53 1543510800 \n", + "4 35.75 1543420800 \n", + "\n", + " apparentTemperatureLow apparentTemperatureLowTime icon \\\n", + "0 27.39 1545044400 partly-cloudy-night \n", + "1 36.20 1543291200 rain \n", + "2 29.11 1543392000 clear-night \n", + "3 26.20 1543575600 clear-night \n", + "4 30.29 1543460400 partly-cloudy-night \n", + "\n", + " dewPoint pressure windBearing cloudCover uvIndex visibility.1 ozone \\\n", + "0 32.70 1021.98 57 0.72 0 10.000 303.8 \n", + "1 41.83 1003.97 90 1.00 0 4.786 291.1 \n", + "2 31.10 992.28 240 0.03 0 10.000 315.7 \n", + "3 26.64 1013.73 310 0.00 0 10.000 291.1 \n", + "4 28.61 998.36 303 0.44 0 10.000 347.7 \n", + "\n", + " sunriseTime sunsetTime moonPhase precipIntensityMax uvIndexTime \\\n", + "0 1544962084 1544994864 0.30 0.1276 1544979600 \n", + "1 1543232969 1543266992 0.64 0.1300 1543251600 \n", + "2 1543319437 1543353364 0.68 0.1064 1543338000 \n", + "3 1543492370 1543526114 0.75 0.0000 1543507200 \n", + "4 1543405904 1543439738 0.72 0.0001 1543420800 \n", + "\n", + " temperatureMin temperatureMinTime temperatureMax temperatureMaxTime \\\n", + "0 39.89 1545012000 43.68 1544968800 \n", + "1 40.49 1543233600 47.30 1543251600 \n", + "2 35.36 
1543377600 47.55 1543320000 \n", + "3 34.67 1543550400 45.03 1543510800 \n", + "4 33.10 1543402800 42.18 1543420800 \n", + "\n", + " apparentTemperatureMin apparentTemperatureMinTime apparentTemperatureMax \\\n", + "0 33.73 1545012000 38.07 \n", + "1 36.20 1543291200 43.92 \n", + "2 31.04 1543377600 44.12 \n", + "3 30.30 1543550400 38.53 \n", + "4 29.11 1543392000 35.75 \n", + "\n", + " apparentTemperatureMaxTime \n", + "0 1544958000 \n", + "1 1543251600 \n", + "2 1543320000 \n", + "3 1543510800 \n", + "4 1543420800 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#showing dataset\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "checking the missing value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "timestamp 0\n", + "hour 0\n", + "day 0\n", + "month 0\n", + "datetime 0\n", + "timezone 0\n", + "source 0\n", + "destination 0\n", + "cab_type 0\n", + "product_id 0\n", + "name 0\n", + "price 55095\n", + "distance 0\n", + "surge_multiplier 0\n", + "latitude 0\n", + "longitude 0\n", + "temperature 0\n", + "apparentTemperature 0\n", + "short_summary 0\n", + "long_summary 0\n", + "precipIntensity 0\n", + "precipProbability 0\n", + "humidity 0\n", + "windSpeed 0\n", + "windGust 0\n", + "windGustTime 0\n", + "visibility 0\n", + "temperatureHigh 0\n", + "temperatureHighTime 0\n", + "temperatureLow 0\n", + "temperatureLowTime 0\n", + "apparentTemperatureHigh 0\n", + "apparentTemperatureHighTime 0\n", + "apparentTemperatureLow 0\n", + "apparentTemperatureLowTime 0\n", + "icon 0\n", + "dewPoint 0\n", + "pressure 0\n", + "windBearing 0\n", + "cloudCover 0\n", + "uvIndex 0\n", + "visibility.1 0\n", + "ozone 0\n", + "sunriseTime 0\n", + "sunsetTime 0\n", + "moonPhase 0\n", + "precipIntensityMax 0\n", + "uvIndexTime 0\n", + "temperatureMin 0\n", + 
"temperatureMinTime 0\n", + "temperatureMax 0\n", + "temperatureMaxTime 0\n", + "apparentTemperatureMin 0\n", + "apparentTemperatureMinTime 0\n", + "apparentTemperatureMax 0\n", + "apparentTemperatureMaxTime 0\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#count the missing value\n", + "df.isnull().sum()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "only the price column has missing values (55095 rows), so now drop the rows with missing values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#drop missing value\n", + "df = df.dropna()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now check how many unique values each column has" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 637976\n", + "timestamp 35796\n", + "hour 24\n", + "day 17\n", + "month 2\n", + "datetime 31350\n", + "timezone 1\n", + "source 12\n", + "destination 12\n", + "cab_type 2\n", + "product_id 12\n", + "name 12\n", + "price 147\n", + "distance 549\n", + "surge_multiplier 7\n", + "latitude 11\n", + "longitude 12\n", + "temperature 308\n", + "apparentTemperature 319\n", + "short_summary 9\n", + "long_summary 11\n", + "precipIntensity 63\n", + "precipProbability 29\n", + "humidity 51\n", + "windSpeed 291\n", + "windGust 286\n", + "windGustTime 25\n", + "visibility 227\n", + "temperatureHigh 129\n", + "temperatureHighTime 23\n", + "temperatureLow 133\n", + "temperatureLowTime 31\n", + "apparentTemperatureHigh 124\n", + "apparentTemperatureHighTime 27\n", + "apparentTemperatureLow 136\n", + "apparentTemperatureLowTime 32\n", + "icon 7\n", + "dewPoint 313\n", + "pressure 316\n", + "windBearing 195\n", + "cloudCover 83\n", + "uvIndex 3\n", + "visibility.1 227\n", + "ozone 274\n", + "sunriseTime 110\n", + "sunsetTime 114\n", + "moonPhase 18\n", +
"precipIntensityMax 65\n", + "uvIndexTime 20\n", + "temperatureMin 131\n", + "temperatureMinTime 25\n", + "temperatureMax 128\n", + "temperatureMaxTime 23\n", + "apparentTemperatureMin 137\n", + "apparentTemperatureMinTime 29\n", + "apparentTemperatureMax 125\n", + "apparentTemperatureMaxTime 27\n", + "dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check unique value\n", + "df.nunique()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and check whether there is any duplicated data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check data duplicate\n", + "df.duplicated().sum()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Splitting Numerical and Categorical Columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now split the numerical and categorical columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "here we only choose the distance column as the numerical column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_col= ['distance']" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now we choose the icon, name, and cab_type columns as the categorical columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat_col= ['icon', 'name', 'cab_type']" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now describe the numerical and categorical columns" + ] + }, + { + "cell_type":
"code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distance
count637976.000000
mean2.189261
std1.135413
min0.020000
25%1.270000
50%2.160000
75%2.930000
max7.860000
\n", + "
" + ], + "text/plain": [ + " distance\n", + "count 637976.000000\n", + "mean 2.189261\n", + "std 1.135413\n", + "min 0.020000\n", + "25% 1.270000\n", + "50% 2.160000\n", + "75% 2.930000\n", + "max 7.860000" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#describe the numerical column\n", + "df[num_col].describe()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "as we can see, the mean of distance is around 2.19" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iconnamecab_type
count637976637976637976
unique7122
topcloudyUberXLUber
freq20142955096330568
\n", + "
" + ], + "text/plain": [ + " icon name cab_type\n", + "count 637976 637976 637976\n", + "unique 7 12 2\n", + "top cloudy UberXL Uber\n", + "freq 201429 55096 330568" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#describe the categorical column\n", + "df[cat_col].describe()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "there are 7 unique labels in icon, 12 unique labels in name, and 2 unique labels in cab_type" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exploratory Data Analysis (EDA)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "copy our dataset, so if there is any mistake in our worksheet we can use the copy of the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#copy the dataset\n", + "df_copy = df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtimestamphourdaymonthdatetimetimezonesourcedestinationcab_typeproduct_idnamepricedistancesurge_multiplierlatitudelongitudetemperatureapparentTemperatureshort_summarylong_summaryprecipIntensityprecipProbabilityhumiditywindSpeedwindGustwindGustTimevisibilitytemperatureHightemperatureHighTimetemperatureLowtemperatureLowTimeapparentTemperatureHighapparentTemperatureHighTimeapparentTemperatureLowapparentTemperatureLowTimeicondewPointpressurewindBearingcloudCoveruvIndexvisibility.1ozonesunriseTimesunsetTimemoonPhaseprecipIntensityMaxuvIndexTimetemperatureMintemperatureMinTimetemperatureMaxtemperatureMaxTimeapparentTemperatureMinapparentTemperatureMinTimeapparentTemperatureMaxapparentTemperatureMaxTime
0424553bb-7174-41ea-aeb4-fe06d4f4b9d71.544953e+09916122018-12-16 09:30:07America/New_YorkHaymarket SquareNorth StationLyftlyft_lineShared5.00.441.042.2148-71.03342.3437.12Mostly CloudyRain throughout the day.0.00000.00.688.669.17154501560010.00043.68154496880034.19154504800037.95154496880027.391545044400partly-cloudy-night32.701021.98570.72010.000303.8154496208415449948640.300.1276154497960039.89154501200043.68154496880033.73154501200038.071544958000
14bd23055-6827-41c6-b23b-3c491f24e74d1.543284e+09227112018-11-27 02:00:23America/New_YorkHaymarket SquareNorth StationLyftlyft_premierLux11.00.441.042.2148-71.03343.5837.35RainRain until morning, starting again in the eve...0.12991.00.9411.9811.9815432912004.78647.30154325160042.10154329840043.92154325160036.201543291200rain41.831003.97901.0004.786291.1154323296915432669920.640.1300154325160040.49154323360047.30154325160036.20154329120043.921543251600
2981a3613-77af-4620-a42a-0c0866077d1e1.543367e+09128112018-11-28 01:00:22America/New_YorkHaymarket SquareNorth StationLyftlyftLyft7.00.441.042.2148-71.03338.3332.93ClearLight rain in the morning.0.00000.00.757.337.33154333440010.00047.55154332000033.10154340280044.12154332000029.111543392000clear-night31.10992.282400.03010.000315.7154331943715433533640.680.1064154333800035.36154337760047.55154332000031.04154337760044.121543320000
3c2d88af2-d278-4bfd-a8d0-29ca77cc55121.543554e+09430112018-11-30 04:53:02America/New_YorkHaymarket SquareNorth StationLyftlyft_luxsuvLux Black XL26.00.441.042.2148-71.03334.3829.63ClearPartly cloudy throughout the day.0.00000.00.735.285.28154351440010.00045.03154351080028.90154357920038.53154351080026.201543575600clear-night26.641013.733100.00010.000291.1154349237015435261140.750.0000154350720034.67154355040045.03154351080030.30154355040038.531543510800
4e0126e1f-8ca9-4f2e-82b3-50505a09db9a1.543463e+09329112018-11-29 03:49:20America/New_YorkHaymarket SquareNorth StationLyftlyft_plusLyft XL9.00.441.042.2148-71.03337.4430.88Partly CloudyMostly cloudy throughout the day.0.00000.00.709.149.14154344600010.00042.18154342080036.71154347840035.75154342080030.291543460400partly-cloudy-night28.61998.363030.44010.000347.7154340590415434397380.720.0001154342080033.10154340280042.18154342080029.11154339200035.751543420800
\n", + "
" + ], + "text/plain": [ + " id timestamp hour day month \\\n", + "0 424553bb-7174-41ea-aeb4-fe06d4f4b9d7 1.544953e+09 9 16 12 \n", + "1 4bd23055-6827-41c6-b23b-3c491f24e74d 1.543284e+09 2 27 11 \n", + "2 981a3613-77af-4620-a42a-0c0866077d1e 1.543367e+09 1 28 11 \n", + "3 c2d88af2-d278-4bfd-a8d0-29ca77cc5512 1.543554e+09 4 30 11 \n", + "4 e0126e1f-8ca9-4f2e-82b3-50505a09db9a 1.543463e+09 3 29 11 \n", + "\n", + " datetime timezone source destination \\\n", + "0 2018-12-16 09:30:07 America/New_York Haymarket Square North Station \n", + "1 2018-11-27 02:00:23 America/New_York Haymarket Square North Station \n", + "2 2018-11-28 01:00:22 America/New_York Haymarket Square North Station \n", + "3 2018-11-30 04:53:02 America/New_York Haymarket Square North Station \n", + "4 2018-11-29 03:49:20 America/New_York Haymarket Square North Station \n", + "\n", + " cab_type product_id name price distance surge_multiplier \\\n", + "0 Lyft lyft_line Shared 5.0 0.44 1.0 \n", + "1 Lyft lyft_premier Lux 11.0 0.44 1.0 \n", + "2 Lyft lyft Lyft 7.0 0.44 1.0 \n", + "3 Lyft lyft_luxsuv Lux Black XL 26.0 0.44 1.0 \n", + "4 Lyft lyft_plus Lyft XL 9.0 0.44 1.0 \n", + "\n", + " latitude longitude temperature apparentTemperature short_summary \\\n", + "0 42.2148 -71.033 42.34 37.12 Mostly Cloudy \n", + "1 42.2148 -71.033 43.58 37.35 Rain \n", + "2 42.2148 -71.033 38.33 32.93 Clear \n", + "3 42.2148 -71.033 34.38 29.63 Clear \n", + "4 42.2148 -71.033 37.44 30.88 Partly Cloudy \n", + "\n", + " long_summary precipIntensity \\\n", + "0 Rain throughout the day. 0.0000 \n", + "1 Rain until morning, starting again in the eve... 0.1299 \n", + "2 Light rain in the morning. 0.0000 \n", + "3 Partly cloudy throughout the day. 0.0000 \n", + "4 Mostly cloudy throughout the day. 
0.0000 \n", + "\n", + " precipProbability humidity windSpeed windGust windGustTime visibility \\\n", + "0 0.0 0.68 8.66 9.17 1545015600 10.000 \n", + "1 1.0 0.94 11.98 11.98 1543291200 4.786 \n", + "2 0.0 0.75 7.33 7.33 1543334400 10.000 \n", + "3 0.0 0.73 5.28 5.28 1543514400 10.000 \n", + "4 0.0 0.70 9.14 9.14 1543446000 10.000 \n", + "\n", + " temperatureHigh temperatureHighTime temperatureLow temperatureLowTime \\\n", + "0 43.68 1544968800 34.19 1545048000 \n", + "1 47.30 1543251600 42.10 1543298400 \n", + "2 47.55 1543320000 33.10 1543402800 \n", + "3 45.03 1543510800 28.90 1543579200 \n", + "4 42.18 1543420800 36.71 1543478400 \n", + "\n", + " apparentTemperatureHigh apparentTemperatureHighTime \\\n", + "0 37.95 1544968800 \n", + "1 43.92 1543251600 \n", + "2 44.12 1543320000 \n", + "3 38.53 1543510800 \n", + "4 35.75 1543420800 \n", + "\n", + " apparentTemperatureLow apparentTemperatureLowTime icon \\\n", + "0 27.39 1545044400 partly-cloudy-night \n", + "1 36.20 1543291200 rain \n", + "2 29.11 1543392000 clear-night \n", + "3 26.20 1543575600 clear-night \n", + "4 30.29 1543460400 partly-cloudy-night \n", + "\n", + " dewPoint pressure windBearing cloudCover uvIndex visibility.1 ozone \\\n", + "0 32.70 1021.98 57 0.72 0 10.000 303.8 \n", + "1 41.83 1003.97 90 1.00 0 4.786 291.1 \n", + "2 31.10 992.28 240 0.03 0 10.000 315.7 \n", + "3 26.64 1013.73 310 0.00 0 10.000 291.1 \n", + "4 28.61 998.36 303 0.44 0 10.000 347.7 \n", + "\n", + " sunriseTime sunsetTime moonPhase precipIntensityMax uvIndexTime \\\n", + "0 1544962084 1544994864 0.30 0.1276 1544979600 \n", + "1 1543232969 1543266992 0.64 0.1300 1543251600 \n", + "2 1543319437 1543353364 0.68 0.1064 1543338000 \n", + "3 1543492370 1543526114 0.75 0.0000 1543507200 \n", + "4 1543405904 1543439738 0.72 0.0001 1543420800 \n", + "\n", + " temperatureMin temperatureMinTime temperatureMax temperatureMaxTime \\\n", + "0 39.89 1545012000 43.68 1544968800 \n", + "1 40.49 1543233600 47.30 1543251600 \n", + "2 35.36 
1543377600 47.55 1543320000 \n", + "3 34.67 1543550400 45.03 1543510800 \n", + "4 33.10 1543402800 42.18 1543420800 \n", + "\n", + " apparentTemperatureMin apparentTemperatureMinTime apparentTemperatureMax \\\n", + "0 33.73 1545012000 38.07 \n", + "1 36.20 1543291200 43.92 \n", + "2 31.04 1543377600 44.12 \n", + "3 30.30 1543550400 38.53 \n", + "4 29.11 1543392000 35.75 \n", + "\n", + " apparentTemperatureMaxTime \n", + "0 1544958000 \n", + "1 1543251600 \n", + "2 1543320000 \n", + "3 1543510800 \n", + "4 1543420800 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_copy.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "grouping the column name by price" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_copy.groupby('name')[['price']].mean().sort_values('price').plot(kind='bar')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "as we can see the `Lux Black XL` is the most expensive prices" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now grouping icon by price" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_copy.groupby('icon')[['price']].mean().sort_values('price').plot(kind='bar')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preprocessing" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train-Test Split / Split Dataset" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "split the dataset between feature and target" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define feature and target\n", + "x = df_copy.drop('price', axis=1)\n", + "y = df_copy['price']\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "split train set and test set based on column types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_train_num = x_train[num_col]\n", + "x_train_cat = x_train[cat_col]\n", + "\n", + "x_test_num = x_test[num_col]\n", + "x_test_cat = x_test[cat_col]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EDA For Modeling" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Numerical Feature" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "define the numerical column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distance
1956830.74
116123.40
4056873.05
273594.48
4677810.61
......
3906820.55
1654071.50
1281322.58
4733232.82
3319410.57
\n", + "

446583 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " distance\n", + "195683 0.74\n", + "11612 3.40\n", + "405687 3.05\n", + "27359 4.48\n", + "467781 0.61\n", + "... ...\n", + "390682 0.55\n", + "165407 1.50\n", + "128132 2.58\n", + "473323 2.82\n", + "331941 0.57\n", + "\n", + "[446583 rows x 1 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_train_t = x_train[num_col]\n", + "x_train_t" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handling Outlier" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now see if there is outlier or not" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def diagnostic_plots(x_train, variable):\n", + " # Define figure size\n", + " plt.figure(figsize=(16, 4))\n", + "\n", + " # Histogram\n", + " plt.subplot(1, 2, 1)\n", + " sns.histplot(x_train[variable], bins=30)\n", + " plt.title('Histogram')\n", + "\n", + " # Boxplot\n", + " plt.subplot(1, 2, 2)\n", + " sns.boxplot(y=x_train[variable])\n", + " plt.title('Boxplot')\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "diagnostic_plots(x_train, 'distance')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "there is outlier in `distance` column" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "capping the outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "windsoriser = Winsorizer(capping_method='iqr', # choose iqr for IQR rule boundaries or gaussian for mean and std\n", + " tail='both', # cap left, right or both tails \n", + " fold=1.5,\n", + " variables=['distance'])\n", + "\n", + "windsoriser.fit(x_train_t)\n", + "\n", + "x_train_cap = windsoriser.transform(x_train_t)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Left Tail Caps : {'distance': -1.195}\n", + "Right Tail Caps : {'distance': 5.405}\n" + ] + } + ], + "source": [ + "# Inspect the minimum caps for each variable\n", + "print('Left Tail Caps : ', windsoriser.left_tail_caps_)\n", + "\n", + "# Iinspect the maximum caps for each variable\n", + "print('Right Tail Caps : ', windsoriser.right_tail_caps_)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now see the different between before and after capping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before Capping\n", + " distance\n", + "count 446583.00000\n", + "mean 2.19019\n", + "std 1.13540\n", + "min 0.02000\n", + "25% 1.28000\n", + "50% 2.16000\n", + "75% 2.93000\n", + "max 7.86000\n", + "\n", + "After Capping\n", + " distance\n", + "count 446583.000000\n", + "mean 2.181211\n", + "std 1.103221\n", + "min 0.020000\n", + "25% 1.280000\n", + "50% 
2.160000\n", + "75% 2.930000\n", + "max 5.405000\n" + ] + } + ], + "source": [ + "print('Before Capping')\n", + "print(x_train_t.describe())\n", + "print('')\n", + "print('After Capping')\n", + "print(x_train_cap.describe())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "there is no more outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "diagnostic_plots(x_train_cap, 'distance')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Feature Encoding" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now encode the categorical column using ordinal encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2., 2., 0.],\n", + " [4., 3., 0.],\n", + " [6., 2., 0.],\n", + " ...,\n", + " [2., 3., 0.],\n", + " [6., 6., 0.],\n", + " [5., 7., 0.]])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#encoding\n", + "encoder = OrdinalEncoder()\n", + "encoder.fit(x_train_cat)\n", + "\n", + "x_train_cat_enc = encoder.transform(x_train_cat)\n", + "x_test_cat_enc = encoder.transform(x_test_cat)\n", + "\n", + "x_train_cat_enc" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Feature Scaling" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and scale the numerical column using minmaxscaler because the data is abnormal distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.09183673],\n", + " [0.43112245],\n", + " [0.38647959],\n", + " ...,\n", + " [0.32653061],\n", + " [0.35714286],\n", + " [0.07015306]])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#scaling\n", + "scale = MinMaxScaler()\n", + "scale.fit(x_train_num)\n", + "\n", + "x_train_num_scale = scale.transform(x_train_num)\n", + "x_test_num_scale = scale.transform(x_test_num)\n", + "\n", + "x_train_num_scale" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### Concat Feature Numberic and Categorial Columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "now concate the the train and test data that we had scale and encode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_train_final = np.concatenate([x_train_num_scale, x_train_cat_enc], axis=1)\n", + "x_test_final = np.concatenate([x_test_num_scale, x_test_cat_enc], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
distanceiconnamecab_type
00.0918372.02.00.0
10.4311224.03.00.0
20.3864806.02.00.0
30.5688781.011.01.0
40.0752551.011.01.0
...............
4465780.0676022.00.01.0
4465790.1887766.011.01.0
4465800.3265312.03.00.0
4465810.3571436.06.00.0
4465820.0701535.07.00.0
\n", + "

446583 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " distance icon name cab_type\n", + "0 0.091837 2.0 2.0 0.0\n", + "1 0.431122 4.0 3.0 0.0\n", + "2 0.386480 6.0 2.0 0.0\n", + "3 0.568878 1.0 11.0 1.0\n", + "4 0.075255 1.0 11.0 1.0\n", + "... ... ... ... ...\n", + "446578 0.067602 2.0 0.0 1.0\n", + "446579 0.188776 6.0 11.0 1.0\n", + "446580 0.326531 2.0 3.0 0.0\n", + "446581 0.357143 6.0 6.0 0.0\n", + "446582 0.070153 5.0 7.0 0.0\n", + "\n", + "[446583 rows x 4 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_train_final_concat = pd.DataFrame(x_train_final, columns=[num_col+cat_col])\n", + "x_train_final_concat" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Definition" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "for define the model we use linear regression because there are correlation between feature and target" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#linear regression\n", + "linreg = LinearRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#fit the model\n", + "linreg.fit(x_train_final, y_train)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([26.97765839, 5.89859759, 15.46590186, ..., 20.8248816 ,\n", + " 13.62962365, 24.47149278])" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred_train = linreg.predict(x_train_final)\n", + "y_pred_test = linreg.predict(x_test_final)\n", + "\n", + "y_pred_test" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "count the r^2 score and the mean absolute error" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "r2 score = 0.4702994909626852\n", + "r2 score = 0.47295163874958046\n", + "MAE = 5.301177482449108\n", + "MAE = 5.288114092458786\n" + ] + } + ], + "source": [ + "print('r2 score = ', r2_score(y_train, y_pred_train))\n", + "print('r2 score = ', r2_score(y_test, y_pred_test))\n", + "print('MAE = ', mean_absolute_error(y_train, y_pred_train))\n", + "print('MAE = ', mean_absolute_error(y_test, y_pred_test))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "for the `r2 score` getting closer to 1 is the better of our data and for for `mae` the farther away from 1 the worse is our data " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Saving" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "last save the model" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#save model\n", + "\n", + "with open('model_lin_reg.pkl', 'wb') as file_1:\n", + " joblib.dump(linreg, file_1)\n", + "\n", + "with open('model_scaler.pkl', 'wb') as file_2:\n", + " joblib.dump(scale, file_2)\n", + "\n", + "with open('model_encoder.pkl', 'wb') as file_3:\n", + " joblib.dump(encoder, file_3)\n", + "\n", + "with open('list_num_cols.txt', 'wb') as file_4:\n", + " joblib.dump(num_col, file_4)\n", + "\n", + "with open('list_cat_cols.txt', 'wb') as file_5:\n", + " joblib.dump(cat_col, file_5)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conceptual Problems" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Sebutkan dan jelaskan asumsi yang dipakai oleh Linear Regression !\n", + "2. Tunjukkan dan tafsirkan arti dari coefficient dan slope yang didapat dari model yang telah Anda bangun !" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. I use the linear regression because there are correlation between the independent variables which is the feature and dependent variables which us the target\n", + "2. slope is the gradien of a line that linking the x and y" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "the data is not to good for modeling because of the r2 score is 0.4 and the mae is 5.3 that means we got many error. but the inference that we got from random generate sample is quite good beacuse the the result of the target after predicted is around the target from our raw dataset." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "7679c2132d3f6ce38c9df14d554b39c06862b36a4e6689c81f9ae15bd0911d7d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}