{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "c0b8d60a", "metadata": { "id": "c0b8d60a" }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "sns.set_style(\"darkgrid\")\n", "sns.set_palette('RdYlGn')\n", "\n", "#model\n", "from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_squared_error, r2_score\n", "from sklearn.ensemble import RandomForestRegressor\n", "from xgboost import XGBRegressor\n", "from sklearn.linear_model import LinearRegression\n", "\n", "import gradio as gr\n", "import joblib" ] }, { "cell_type": "code", "execution_count": 4, "id": "11273e4d", "metadata": { "id": "11273e4d" }, "outputs": [], "source": [ "df = pd.read_csv(\"/content/Nigerian_Car_Prices.csv\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "dffa0dba", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 340 }, "id": "dffa0dba", "outputId": "eb17a45d-8e91-41b5-ddae-0be82f2fe1f6" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Unnamed: 0 Make Year of manufacture Condition Mileage \\\n", "0 0 Toyota 2007.0 Nigerian Used 166418.0 \n", "1 1 Lexus NaN NaN 138024.0 \n", "2 2 Mercedes-Benz 2008.0 Nigerian Used 376807.0 \n", "3 3 Lexus NaN NaN 213362.0 \n", "4 4 Mercedes-Benz NaN NaN 106199.0 \n", "\n", " Engine Size Fuel Transmission Price Build \n", "0 2400.0 Petrol Automatic 3,120,000 NaN \n", "1 NaN NaN Automatic 5,834,000 NaN \n", "2 3000.0 Petrol Automatic 3,640,000 NaN \n", "3 NaN NaN Automatic 3,594,000 NaN \n", "4 NaN NaN Automatic 8,410,000 NaN " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0MakeYear of manufactureConditionMileageEngine SizeFuelTransmissionPriceBuild
00Toyota2007.0Nigerian Used166418.02400.0PetrolAutomatic3,120,000NaN
11LexusNaNNaN138024.0NaNNaNAutomatic5,834,000NaN
22Mercedes-Benz2008.0Nigerian Used376807.03000.0PetrolAutomatic3,640,000NaN
33LexusNaNNaN213362.0NaNNaNAutomatic3,594,000NaN
44Mercedes-BenzNaNNaN106199.0NaNNaNAutomatic8,410,000NaN
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 5 } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "30f57450", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "30f57450", "outputId": "462327ca-b494-4cc7-d8d1-aa765e166650" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 4095 entries, 0 to 4094\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Unnamed: 0 4095 non-null int64 \n", " 1 Make 4095 non-null object \n", " 2 Year of manufacture 3617 non-null float64\n", " 3 Condition 3616 non-null object \n", " 4 Mileage 4024 non-null float64\n", " 5 Engine Size 3584 non-null float64\n", " 6 Fuel 3607 non-null object \n", " 7 Transmission 4075 non-null object \n", " 8 Price 4095 non-null object \n", " 9 Build 1127 non-null object \n", "dtypes: float64(3), int64(1), object(6)\n", "memory usage: 320.0+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "id": "2b138a73", "metadata": { "id": "2b138a73" }, "source": [ "### Data Cleaning" ] }, { "cell_type": "code", "execution_count": 7, "id": "fd78bcc0", "metadata": { "id": "fd78bcc0" }, "outputs": [], "source": [ "df = df.drop('Build', axis = 1)" ] }, { "cell_type": "code", "execution_count": 8, "id": "60013f82", "metadata": { "id": "60013f82" }, "outputs": [], "source": [ "df = df.dropna()" ] }, { "cell_type": "code", "execution_count": 9, "id": "62b833d4", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "62b833d4", "outputId": "05f88dbc-c2db-45be-c1c1-0f8553706eae" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(3523, 9)" ] }, "metadata": {}, "execution_count": 9 } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 10, "id": "e04b4172", "metadata": { "id": "e04b4172" }, "outputs": [], "source": [ "df['Price'] = df['Price'].str.replace(',', '') \n", "df['Price'] = df['Price'].astype(float) \n", "\n", "df['Year of manufacture'] = df['Year of manufacture'].astype(int) " ] }, { "cell_type": "code", "execution_count": 11, "id": "c62daca5", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "c62daca5", "outputId": "6639a400-6ded-4f42-cbe5-4469c7fa27f2" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Unnamed: 0 Year of manufacture Mileage Engine Size \\\n", "count 3523.000000 3523.000000 3.523000e+03 3523.000000 \n", "mean 2089.276753 2007.921090 1.901794e+05 3170.591541 \n", "std 1187.608368 4.303771 2.215162e+05 4641.379934 \n", "min 0.000000 1992.000000 1.000000e+00 3.000000 \n", "25% 1066.500000 2005.000000 1.070360e+05 2000.000000 \n", "50% 2085.000000 2008.000000 1.670060e+05 2500.000000 \n", "75% 3136.500000 2011.000000 2.397715e+05 3500.000000 \n", "max 4094.000000 2021.000000 9.976050e+06 184421.000000 \n", "\n", " Price \n", "count 3.523000e+03 \n", "mean 4.060590e+06 \n", "std 4.520306e+06 \n", "min 4.725000e+05 \n", "25% 1.800000e+06 \n", "50% 2.835000e+06 \n", "75% 4.500000e+06 \n", "max 5.880000e+07 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Year of manufactureMileageEngine SizePrice
count3523.0000003523.0000003.523000e+033523.0000003.523000e+03
mean2089.2767532007.9210901.901794e+053170.5915414.060590e+06
std1187.6083684.3037712.215162e+054641.3799344.520306e+06
min0.0000001992.0000001.000000e+003.0000004.725000e+05
25%1066.5000002005.0000001.070360e+052000.0000001.800000e+06
50%2085.0000002008.0000001.670060e+052500.0000002.835000e+06
75%3136.5000002011.0000002.397715e+053500.0000004.500000e+06
max4094.0000002021.0000009.976050e+06184421.0000005.880000e+07
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 11 } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "id": "910be70f", "metadata": { "id": "910be70f" }, "source": [ "### EDA" ] }, { "cell_type": "markdown", "id": "90e49305", "metadata": { "id": "90e49305" }, "source": [ "### Feature Engineering" ] }, { "cell_type": "code", "source": [ "#the brand new is just 5, it will be drop\n", "# Dropping the 'Brand New' category\n", "df = df[df['Condition'] != 'Brand New']" ], "metadata": { "id": "PkF02_5ah3bB" }, "id": "PkF02_5ah3bB", "execution_count": 35, "outputs": [] }, { "cell_type": "code", "execution_count": 38, "id": "544f2b81", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "544f2b81", "outputId": "efdf1889-b1b6-445c-901a-acab17d1cda1" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['scaler.joblib']" ] }, "metadata": {}, "execution_count": 38 } ], "source": [ "X = df.drop(['Unnamed: 0', 'Price'], axis = 1)\n", "y = df.Price\n", "\n", "make_counts = X['Make'].value_counts()\n", "\n", "\n", "# Get the values to replace with 'Others'\n", "make_others = make_counts[make_counts < 14].index.tolist()\n", "\n", "# Replace values with 'Others'\n", "X['Make'] = X['Make'].apply(lambda x: 'Others' if x in make_others else x)\n", "\n", "X_train,X_test, y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state=10)\n", "\n", "\n", "# Initializing the encoders and scaler for each column\n", "make_encoder = LabelEncoder()\n", "fuel_encoder = LabelEncoder()\n", "transmission_encoder = LabelEncoder()\n", "condition_encoder = LabelEncoder()\n", "scaler = MinMaxScaler()\n", "\n", "# Encoding and scaling each column individually\n", "X_train['Make'] = make_encoder.fit_transform(X_train['Make'])\n", "X_test['Make'] = make_encoder.transform(X_test['Make'])\n", "\n", "X_train['Fuel'] = fuel_encoder.fit_transform(X_train['Fuel'])\n", "X_test['Fuel'] = fuel_encoder.transform(X_test['Fuel'])\n", "\n", "X_train['Transmission'] = transmission_encoder.fit_transform(X_train['Transmission'])\n", "X_test['Transmission'] = transmission_encoder.transform(X_test['Transmission'])\n", "\n", "X_train['Condition'] = condition_encoder.fit_transform(X_train['Condition'])\n", "X_test['Condition'] = condition_encoder.transform(X_test['Condition'])\n", "\n", "X_train[['Year of manufacture', 'Mileage', 'Engine Size']] = scaler.fit_transform(X_train[['Year of manufacture', 'Mileage', 'Engine Size']])\n", "X_test[['Year of manufacture', 'Mileage', 'Engine Size']] = scaler.transform(X_test[['Year of manufacture', 'Mileage', 'Engine Size']])\n", "\n", "# Save the encoders and scaler\n", "joblib.dump(make_encoder, \"make_encoder.joblib\",compress=3)\n", "joblib.dump(fuel_encoder, \"fuel_encoder.joblib\",compress=3)\n", "joblib.dump(transmission_encoder, \"transmission_encoder.joblib\",compress=3)\n", "joblib.dump(condition_encoder, \"condition_encoder.joblib\",compress=3)\n", "joblib.dump(scaler, \"scaler.joblib\",compress=3)" ] }, { "cell_type": "markdown", "id": "307eab41", "metadata": { "id": "307eab41" }, "source": [ "#### Needed Model" ] }, { "cell_type": "code", "execution_count": 39, "id": "23aaa0f7", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "23aaa0f7", "outputId": "7ac3f946-76f2-4e32-bda3-84106fcec209" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Random Forest RMSE: 1900923.15\n", "XGBoost RMSE: 1881430.11\n", "Linear Regression RMSE: 3227815.24\n" ] } ], "source": [ "# Initialize the models\n", "rf_model = RandomForestRegressor(random_state=42)\n", "xgb_model = XGBRegressor(random_state=42)\n", "lr_model = LinearRegression()\n", "\n", "# Fit the models on the training data\n", "rf_model.fit(X_train, y_train)\n", "xgb_model.fit(X_train, y_train)\n", "lr_model.fit(X_train, y_train)\n", "\n", "# Make predictions on the testing data\n", "rf_preds = rf_model.predict(X_test)\n", "xgb_preds = xgb_model.predict(X_test)\n", "lr_preds = lr_model.predict(X_test)\n", "\n", "# Evaluate the models using root mean squared error (RMSE)\n", "rf_rmse = mean_squared_error(y_test, rf_preds, squared=False)\n", "xgb_rmse = mean_squared_error(y_test, xgb_preds, squared=False)\n", "lr_rmse = mean_squared_error(y_test, lr_preds, squared=False)\n", "\n", "# Print the RMSE scores\n", "print(f\"Random Forest RMSE: {rf_rmse:.2f}\")\n", "print(f\"XGBoost RMSE: {xgb_rmse:.2f}\")\n", "print(f\"Linear Regression RMSE: {lr_rmse:.2f}\")" ] }, { "cell_type": "code", "source": [ "# R2 score\n", "rf_r2 = r2_score(y_test, rf_preds)\n", "print(\"Random Forest R2 Score:\", rf_r2)\n", "\n", "\n", "xgb_r2 = r2_score(y_test, xgb_preds)\n", "print(\"XGBoost R2 Score:\", xgb_r2)\n", "\n", "\n", "lr_r2 = r2_score(y_test, lr_preds)\n", "print(\"Linear Regression R2 Score:\", lr_r2)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HAij8ecNkQf4", "outputId": "cfeb36b4-201b-413a-8b4f-ce722b9d7ef3" }, "id": "HAij8ecNkQf4", "execution_count": 40, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Random Forest R2 Score: 0.7692007346747749\n", "XGBoost R2 Score: 0.7739099336774033\n", "Linear Regression R2 Score: 0.33453895627915986\n" ] } ] }, { "cell_type": "code", "execution_count": 41, "id": "f9dfda36", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f9dfda36", "outputId": "69882d26-6915-4f06-c5af-d38ce97417cd" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['car_model.joblib']" ] }, "metadata": {}, "execution_count": 41 } ], "source": [ "joblib.dump(xgb_model, \"car_model.joblib\", compress=3)" ] }, { "cell_type": "markdown", "id": "faeff4c7", "metadata": { "id": "faeff4c7" }, "source": [ "**Note: Many Models have been built, but only the needed ones were kept**" ] }, { "cell_type": "code", "execution_count": 42, "id": "1b6ca9be", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 472 }, "id": "1b6ca9be", "outputId": "a049c64e-ea4f-44d3-9bfb-4a03cc01a7cf" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "sns.histplot(xgb_preds, label='prediction',color='red')\n", "sns.histplot(y_test, label='actual price', color = 'blue')\n", "plt.title('Prediction Vs Actual')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "e921f047", "metadata": { "id": "e921f047" }, "source": [ "### Prediction" ] }, { "cell_type": "code", "execution_count": 43, "id": "e23ac604", "metadata": { "id": "e23ac604" }, "outputs": [], "source": [ "import joblib\n", "def predict_car_price(make, year, condition, mileage, engine_size, fuel, transmission):\n", " # Load the encoders and scaler\n", " make_encoder = joblib.load(\"make_encoder.joblib\")\n", " fuel_encoder = joblib.load(\"fuel_encoder.joblib\")\n", " transmission_encoder = joblib.load(\"transmission_encoder.joblib\")\n", " condition_encoder = joblib.load(\"condition_encoder.joblib\")\n", " scaler = joblib.load(\"scaler.joblib\")\n", "\n", " # Preprocess the input\n", " make_encoded = make_encoder.transform([make])[0]\n", " numerical_value = scaler.transform([[year,mileage, engine_size]])\n", " year_scaled = numerical_value[0][0]\n", " mileage_scaled = numerical_value[0][1]\n", " engine_size_scaled = numerical_value[0][2]\n", " fuel_encoded = fuel_encoder.transform([fuel])[0]\n", " condition_encoded = condition_encoder.transform([condition])[0]\n", " transmission_encoded = transmission_encoder.transform([transmission])[0]\n", "\n", " input_data = [[make_encoded, year_scaled, condition_encoded, mileage_scaled, engine_size_scaled, fuel_encoded, transmission_encoded]]\n", " input_df = pd.DataFrame(input_data, columns=['Make', 'Year of manufacture', 'Condition', 'Mileage', 'Engine Size', 'Fuel', 'Transmission'])\n", "\n", " # Make predictions\n", " predicted_price = xgb_model.predict(input_df)\n", " return round(predicted_price[0], 2)" ] }, { "cell_type": "code", "execution_count": 44, "id": "07692f2e", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "07692f2e", "outputId": "c70a6f63-72db-4129-e38a-2f319e506f35" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "4970118.0" ] }, "metadata": {}, "execution_count": 44 } ], "source": [ "predict_car_price('Toyota', 2010,'Nigerian Used', 3000, 2300, 'Petrol', 'Automatic')" ] }, { "cell_type": "markdown", "id": "fce6ae74", "metadata": { "id": "fce6ae74" }, "source": [ "### Gradio Interface" ] }, { "cell_type": "code", "source": [ "import gradio as gr\n", "import joblib\n", "def predict_car_price(make, year, condition, mileage, engine_size, fuel, transmission):\n", " # Load the encoders and scaler\n", " make_encoder = joblib.load(\"make_encoder.joblib\")\n", " fuel_encoder = joblib.load(\"fuel_encoder.joblib\")\n", " transmission_encoder = joblib.load(\"transmission_encoder.joblib\")\n", " condition_encoder = joblib.load(\"condition_encoder.joblib\")\n", " scaler = joblib.load(\"scaler.joblib\")\n", "\n", " make_encoded = make_encoder.transform([make])[0]\n", " numerical_value = scaler.transform([[year,mileage, engine_size]])\n", " year_scaled = numerical_value[0][0]\n", " mileage_scaled = numerical_value[0][1]\n", " engine_size_scaled = numerical_value[0][2]\n", " fuel_encoded = fuel_encoder.transform([fuel])[0]\n", " condition_encoded = condition_encoder.transform([condition])[0]\n", " transmission_encoded = transmission_encoder.transform([transmission])[0]\n", " input_data = [[make_encoded, year_scaled, condition_encoded, mileage_scaled, engine_size_scaled, fuel_encoded, transmission_encoded]]\n", " input_df = pd.DataFrame(input_data, columns=['Make', 'Year of manufacture', 'Condition', 'Mileage', 'Engine Size', 'Fuel', 'Transmission'])\n", "\n", " # Make predictions\n", " predicted_price = xgb_model.predict(input_df)\n", " return round(predicted_price[0], 2)\n", "make_dropdown = gr.inputs.Dropdown(['Acura', 'Audi', 'BMW', 'Chevrolet', 'Dodge', 'Ford', 'Honda',\n", " 'Hyundai', 'Infiniti', 'Kia', 'Land Rover', 'Lexus', 'Mazda',\n", " 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Peugeot',\n", " 'Pontiac', 'Toyota', 'Volkswagen', 'Volvo'], label=\"Make\")\n", "condition_dropdown = gr.inputs.Dropdown(['Foreign Used', 'Nigerian Used'], label=\"Condition\")\n", "fuel_dropdown = gr.inputs.Dropdown([\"Petrol\", \"Diesel\", \"Electric\"], label=\"Fuel\")\n", "transmission_dropdown = gr.inputs.Dropdown([\"Manual\", \"Automatic\", \"AMT\"], label=\"Transmission\")\n", "year_slider = gr.inputs.Slider(minimum=1992, maximum=2021, step=1, default=2010, label=\"Year\")\n", "mileage_slider = gr.inputs.Slider(minimum=1, maximum=300000, step=10, default=80000, label=\"Mileage\")\n", "engine_size_slider = gr.inputs.Slider(minimum=1, maximum=20000, step=1, default=100, label=\"Engine Size\")\n", "\n", "iface = gr.Interface(\n", "fn=predict_car_price,\n", "inputs=[make_dropdown, year_slider, condition_dropdown, mileage_slider, engine_size_slider, fuel_dropdown, transmission_dropdown],\n", "outputs=\"number\",\n", "title=\"Car Price Prediction\",\n", " description=\"Predict the price of a car based on its details, in Naira.\",\n", " examples=[\n", " [\"Toyota\", 2010, \"Nigerian Used\", 80000, 2.0, \"Petrol\", \"Automatic\"],\n", " [\"Mercedes-Benz\", 2015, \"Foreign Used\", 50000, 1000, \"Diesel\", \"AMT\"],\n", " ],css=\".gradio-container {background-color: lightgreen}\"\n", ")\n", "\n", "iface.launch(share = True)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 611 }, "id": "0ZNR9WJ5m5dA", "outputId": "b4292dcc-3397-46db-d5b2-3932ff51c657" }, "id": "0ZNR9WJ5m5dA", "execution_count": 46, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", "Running on public URL: https://99918e8c858d7db896.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "
" ] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [] }, "metadata": {}, "execution_count": 46 } ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" }, "colab": { "provenance": [] } }, "nbformat": 4, "nbformat_minor": 5 }