{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "c0b8d60a",
"metadata": {
"id": "c0b8d60a"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"sns.set_style(\"darkgrid\")\n",
"sns.set_palette('RdYlGn')\n",
"\n",
"#model\n",
"from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from xgboost import XGBRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"import gradio as gr\n",
"import joblib"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "11273e4d",
"metadata": {
"id": "11273e4d"
},
"outputs": [],
"source": [
"# Location of the raw listings CSV (path as used in the original Colab run).\n",
"DATA_PATH = \"/content/Nigerian_Car_Prices.csv\"\n",
"\n",
"df = pd.read_csv(DATA_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "dffa0dba",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 340
},
"id": "dffa0dba",
"outputId": "eb17a45d-8e91-41b5-ddae-0be82f2fe1f6"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Unnamed: 0 Make Year of manufacture Condition Mileage \\\n",
"0 0 Toyota 2007.0 Nigerian Used 166418.0 \n",
"1 1 Lexus NaN NaN 138024.0 \n",
"2 2 Mercedes-Benz 2008.0 Nigerian Used 376807.0 \n",
"3 3 Lexus NaN NaN 213362.0 \n",
"4 4 Mercedes-Benz NaN NaN 106199.0 \n",
"\n",
" Engine Size Fuel Transmission Price Build \n",
"0 2400.0 Petrol Automatic 3,120,000 NaN \n",
"1 NaN NaN Automatic 5,834,000 NaN \n",
"2 3000.0 Petrol Automatic 3,640,000 NaN \n",
"3 NaN NaN Automatic 3,594,000 NaN \n",
"4 NaN NaN Automatic 8,410,000 NaN "
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Make | \n",
" Year of manufacture | \n",
" Condition | \n",
" Mileage | \n",
" Engine Size | \n",
" Fuel | \n",
" Transmission | \n",
" Price | \n",
" Build | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" Toyota | \n",
" 2007.0 | \n",
" Nigerian Used | \n",
" 166418.0 | \n",
" 2400.0 | \n",
" Petrol | \n",
" Automatic | \n",
" 3,120,000 | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Lexus | \n",
" NaN | \n",
" NaN | \n",
" 138024.0 | \n",
" NaN | \n",
" NaN | \n",
" Automatic | \n",
" 5,834,000 | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" Mercedes-Benz | \n",
" 2008.0 | \n",
" Nigerian Used | \n",
" 376807.0 | \n",
" 3000.0 | \n",
" Petrol | \n",
" Automatic | \n",
" 3,640,000 | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Lexus | \n",
" NaN | \n",
" NaN | \n",
" 213362.0 | \n",
" NaN | \n",
" NaN | \n",
" Automatic | \n",
" 3,594,000 | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" Mercedes-Benz | \n",
" NaN | \n",
" NaN | \n",
" 106199.0 | \n",
" NaN | \n",
" NaN | \n",
" Automatic | \n",
" 8,410,000 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 5
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "30f57450",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "30f57450",
"outputId": "462327ca-b494-4cc7-d8d1-aa765e166650"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"RangeIndex: 4095 entries, 0 to 4094\n",
"Data columns (total 10 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Unnamed: 0 4095 non-null int64 \n",
" 1 Make 4095 non-null object \n",
" 2 Year of manufacture 3617 non-null float64\n",
" 3 Condition 3616 non-null object \n",
" 4 Mileage 4024 non-null float64\n",
" 5 Engine Size 3584 non-null float64\n",
" 6 Fuel 3607 non-null object \n",
" 7 Transmission 4075 non-null object \n",
" 8 Price 4095 non-null object \n",
" 9 Build 1127 non-null object \n",
"dtypes: float64(3), int64(1), object(6)\n",
"memory usage: 320.0+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"id": "2b138a73",
"metadata": {
"id": "2b138a73"
},
"source": [
"### Data Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fd78bcc0",
"metadata": {
"id": "fd78bcc0"
},
"outputs": [],
"source": [
"# 'Build' is populated for only 1127 of 4095 rows (see df.info() above), so discard it.\n",
"df = df.drop(columns=['Build'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "60013f82",
"metadata": {
"id": "60013f82"
},
"outputs": [],
"source": [
"# Keep only fully populated rows: any row with at least one missing value is removed.\n",
"df = df.dropna(axis=0, how='any')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "62b833d4",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "62b833d4",
"outputId": "05f88dbc-c2db-45be-c1c1-0f8553706eae"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(3523, 9)"
]
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e04b4172",
"metadata": {
"id": "e04b4172"
},
"outputs": [],
"source": [
"# Price arrives as text with thousands separators (e.g. \"3,120,000\"): strip the commas\n",
"# and convert to float. Casting to str first keeps this cell safe to re-run after\n",
"# Price has already been converted (the .str accessor fails on a float column).\n",
"df['Price'] = (\n",
"    df['Price']\n",
"    .astype(str)\n",
"    .str.replace(',', '', regex=False)\n",
"    .astype(float)\n",
")\n",
"\n",
"# Manufacture years were read as floats (e.g. 2007.0); store them as whole numbers.\n",
"df['Year of manufacture'] = df['Year of manufacture'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c62daca5",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
},
"id": "c62daca5",
"outputId": "6639a400-6ded-4f42-cbe5-4469c7fa27f2"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Unnamed: 0 Year of manufacture Mileage Engine Size \\\n",
"count 3523.000000 3523.000000 3.523000e+03 3523.000000 \n",
"mean 2089.276753 2007.921090 1.901794e+05 3170.591541 \n",
"std 1187.608368 4.303771 2.215162e+05 4641.379934 \n",
"min 0.000000 1992.000000 1.000000e+00 3.000000 \n",
"25% 1066.500000 2005.000000 1.070360e+05 2000.000000 \n",
"50% 2085.000000 2008.000000 1.670060e+05 2500.000000 \n",
"75% 3136.500000 2011.000000 2.397715e+05 3500.000000 \n",
"max 4094.000000 2021.000000 9.976050e+06 184421.000000 \n",
"\n",
" Price \n",
"count 3.523000e+03 \n",
"mean 4.060590e+06 \n",
"std 4.520306e+06 \n",
"min 4.725000e+05 \n",
"25% 1.800000e+06 \n",
"50% 2.835000e+06 \n",
"75% 4.500000e+06 \n",
"max 5.880000e+07 "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Year of manufacture | \n",
" Mileage | \n",
" Engine Size | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 3523.000000 | \n",
" 3523.000000 | \n",
" 3.523000e+03 | \n",
" 3523.000000 | \n",
" 3.523000e+03 | \n",
"
\n",
" \n",
" mean | \n",
" 2089.276753 | \n",
" 2007.921090 | \n",
" 1.901794e+05 | \n",
" 3170.591541 | \n",
" 4.060590e+06 | \n",
"
\n",
" \n",
" std | \n",
" 1187.608368 | \n",
" 4.303771 | \n",
" 2.215162e+05 | \n",
" 4641.379934 | \n",
" 4.520306e+06 | \n",
"
\n",
" \n",
" min | \n",
" 0.000000 | \n",
" 1992.000000 | \n",
" 1.000000e+00 | \n",
" 3.000000 | \n",
" 4.725000e+05 | \n",
"
\n",
" \n",
" 25% | \n",
" 1066.500000 | \n",
" 2005.000000 | \n",
" 1.070360e+05 | \n",
" 2000.000000 | \n",
" 1.800000e+06 | \n",
"
\n",
" \n",
" 50% | \n",
" 2085.000000 | \n",
" 2008.000000 | \n",
" 1.670060e+05 | \n",
" 2500.000000 | \n",
" 2.835000e+06 | \n",
"
\n",
" \n",
" 75% | \n",
" 3136.500000 | \n",
" 2011.000000 | \n",
" 2.397715e+05 | \n",
" 3500.000000 | \n",
" 4.500000e+06 | \n",
"
\n",
" \n",
" max | \n",
" 4094.000000 | \n",
" 2021.000000 | \n",
" 9.976050e+06 | \n",
" 184421.000000 | \n",
" 5.880000e+07 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"id": "910be70f",
"metadata": {
"id": "910be70f"
},
"source": [
"### EDA"
]
},
{
"cell_type": "markdown",
"id": "90e49305",
"metadata": {
"id": "90e49305"
},
"source": [
"### Feature Engineering"
]
},
{
"cell_type": "code",
"source": [
"# Exclude 'Brand New' listings: per the original note there are only about five of them.\n",
"is_brand_new = df['Condition'] == 'Brand New'\n",
"df = df[~is_brand_new]"
],
"metadata": {
"id": "PkF02_5ah3bB"
},
"id": "PkF02_5ah3bB",
"execution_count": 35,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 38,
"id": "544f2b81",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "544f2b81",
"outputId": "efdf1889-b1b6-445c-901a-acab17d1cda1"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['scaler.joblib']"
]
},
"metadata": {},
"execution_count": 38
}
],
"source": [
"# Settings for this cell, named once instead of being repeated inline.\n",
"MIN_MAKE_COUNT = 14  # makes with fewer listings than this are pooled together\n",
"RARE_MAKE_LABEL = 'Others'\n",
"NUMERIC_COLUMNS = ['Year of manufacture', 'Mileage', 'Engine Size']\n",
"\n",
"# Features are every column except the CSV row index and the target; the target is Price.\n",
"X = df.drop(['Unnamed: 0', 'Price'], axis=1)\n",
"y = df.Price\n",
"\n",
"# Pool infrequent makes into a single bucket so the encoder sees fewer classes.\n",
"# NOTE(review): the counts are taken on the full data set before the split; consider\n",
"# computing them on the training rows only so no test information is used here.\n",
"make_counts = X['Make'].value_counts()\n",
"make_others = make_counts[make_counts < MIN_MAKE_COUNT].index.tolist()\n",
"X['Make'] = X['Make'].mask(X['Make'].isin(make_others), RARE_MAKE_LABEL)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)\n",
"\n",
"# Take explicit copies so the column assignments below operate on independent frames\n",
"# instead of relying on how the split happened to slice X.\n",
"X_train = X_train.copy()\n",
"X_test = X_test.copy()\n",
"\n",
"# One encoder per categorical column, plus a single scaler for the numeric columns.\n",
"make_encoder = LabelEncoder()\n",
"fuel_encoder = LabelEncoder()\n",
"transmission_encoder = LabelEncoder()\n",
"condition_encoder = LabelEncoder()\n",
"scaler = MinMaxScaler()\n",
"\n",
"categorical_encoders = {\n",
"    'Make': make_encoder,\n",
"    'Fuel': fuel_encoder,\n",
"    'Transmission': transmission_encoder,\n",
"    'Condition': condition_encoder,\n",
"}\n",
"\n",
"# Fit each encoder on the training rows only, then apply the learned mapping to the test\n",
"# rows. LabelEncoder.transform raises ValueError if the test rows hold an unseen category.\n",
"for column, encoder in categorical_encoders.items():\n",
"    X_train[column] = encoder.fit_transform(X_train[column])\n",
"    X_test[column] = encoder.transform(X_test[column])\n",
"\n",
"# Scale the numeric columns with the min/max learned from the training rows.\n",
"X_train[NUMERIC_COLUMNS] = scaler.fit_transform(X_train[NUMERIC_COLUMNS])\n",
"X_test[NUMERIC_COLUMNS] = scaler.transform(X_test[NUMERIC_COLUMNS])\n",
"\n",
"# Save the fitted encoders and scaler so the same preprocessing can be reloaded later.\n",
"joblib.dump(make_encoder, \"make_encoder.joblib\", compress=3)\n",
"joblib.dump(fuel_encoder, \"fuel_encoder.joblib\", compress=3)\n",
"joblib.dump(transmission_encoder, \"transmission_encoder.joblib\", compress=3)\n",
"joblib.dump(condition_encoder, \"condition_encoder.joblib\", compress=3)\n",
"joblib.dump(scaler, \"scaler.joblib\", compress=3)"
]
},
{
"cell_type": "markdown",
"id": "307eab41",
"metadata": {
"id": "307eab41"
},
"source": [
"#### Model Training and Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "23aaa0f7",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "23aaa0f7",
"outputId": "7ac3f946-76f2-4e32-bda3-84106fcec209"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Random Forest RMSE: 1900923.15\n",
"XGBoost RMSE: 1881430.11\n",
"Linear Regression RMSE: 3227815.24\n"
]
}
],
"source": [
"# Candidate regressors; fixed seeds keep the Random Forest and XGBoost fits reproducible.\n",
"rf_model = RandomForestRegressor(random_state=42)\n",
"xgb_model = XGBRegressor(random_state=42)\n",
"lr_model = LinearRegression()\n",
"\n",
"# Fit every model on the same training split.\n",
"for model in (rf_model, xgb_model, lr_model):\n",
"    model.fit(X_train, y_train)\n",
"\n",
"# Predict on the held-out test split.\n",
"rf_preds = rf_model.predict(X_test)\n",
"xgb_preds = xgb_model.predict(X_test)\n",
"lr_preds = lr_model.predict(X_test)\n",
"\n",
"# RMSE is the square root of the MSE. It is computed with np.sqrt because the\n",
"# `squared=False` argument of mean_squared_error was deprecated in scikit-learn 1.4\n",
"# and removed in 1.6; this form works on every version.\n",
"rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))\n",
"xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_preds))\n",
"lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))\n",
"\n",
"# Report the scores (same units as Price; lower is better).\n",
"print(f\"Random Forest RMSE: {rf_rmse:.2f}\")\n",
"print(f\"XGBoost RMSE: {xgb_rmse:.2f}\")\n",
"print(f\"Linear Regression RMSE: {lr_rmse:.2f}\")"
]
},
{
"cell_type": "code",
"source": [
"# Coefficient of determination (R2) for each model on the test split.\n",
"rf_r2 = r2_score(y_test, rf_preds)\n",
"xgb_r2 = r2_score(y_test, xgb_preds)\n",
"lr_r2 = r2_score(y_test, lr_preds)\n",
"\n",
"# Print one line per model, in the same order the models were trained.\n",
"for label, score in (\n",
"    (\"Random Forest R2 Score:\", rf_r2),\n",
"    (\"XGBoost R2 Score:\", xgb_r2),\n",
"    (\"Linear Regression R2 Score:\", lr_r2),\n",
"):\n",
"    print(label, score)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HAij8ecNkQf4",
"outputId": "cfeb36b4-201b-413a-8b4f-ce722b9d7ef3"
},
"id": "HAij8ecNkQf4",
"execution_count": 40,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Random Forest R2 Score: 0.7692007346747749\n",
"XGBoost R2 Score: 0.7739099336774033\n",
"Linear Regression R2 Score: 0.33453895627915986\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "f9dfda36",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "f9dfda36",
"outputId": "69882d26-6915-4f06-c5af-d38ce97417cd"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['car_model.joblib']"
]
},
"metadata": {},
"execution_count": 41
}
],
"source": [
"# Persist the XGBoost regressor: in the recorded outputs above it has the lowest RMSE\n",
"# and the highest R2 of the three models.\n",
"joblib.dump(value=xgb_model, filename=\"car_model.joblib\", compress=3)"
]
},
{
"cell_type": "markdown",
"id": "faeff4c7",
"metadata": {
"id": "faeff4c7"
},
"source": [
"**Note: Several models were built during experimentation, but only the ones needed are kept in this notebook.**"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "1b6ca9be",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 472
},
"id": "1b6ca9be",
"outputId": "a049c64e-ea4f-44d3-9bfb-4a03cc01a7cf"
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"