File size: 4,603 Bytes
9b968be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Read the raw listings CSV into a DataFrame.
# low_memory=False asks pandas to parse the file in a single pass instead of
# in chunks, which avoids mixed-dtype inference within a column.
file_path = "CAR/CTP_Model1.csv"
data = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# IQR-based outlier filter
def remove_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    both inclusive, where Q1/Q3 are the 25th/75th percentiles of the column.
    Rows whose value is NaN fail the comparison and are therefore dropped.
    The original index labels of the surviving rows are kept.
    """
    values = df[column]
    q_low = values.quantile(0.25)
    q_high = values.quantile(0.75)
    spread = q_high - q_low
    # `between` is inclusive on both ends by default.
    inside_fences = values.between(q_low - multiplier * spread,
                                   q_high + multiplier * spread)
    return df[inside_fences]

# Drop price outliers (IQR fences widened to 2x the IQR), then keep only
# listings priced strictly above 100.
data = remove_outliers_iqr(data, 'price', multiplier=2).loc[
    lambda frame: frame['price'] > 100
]

# Feature engineering
def create_features(df, *, current_year=2024):
    """Return a copy of *df* with derived vehicle-age features added.

    The input frame is not modified. Three columns are added to the copy:

    * ``age``: ``current_year - year``
    * ``age_squared``: ``age ** 2``
    * ``mileage_per_year``: ``odometer / (age + 1)``, clipped to
      the range [0, 200000]

    Args:
        df: DataFrame containing ``year`` and ``odometer`` columns.
        current_year: Reference year used to compute the vehicle age.
            Defaults to 2024, the value that was previously hard-coded, so
            existing callers get the same features as before.

    Returns:
        A new DataFrame with the extra feature columns.
    """
    df = df.copy()
    df['age'] = current_year - df['year']
    df['age_squared'] = df['age'] ** 2
    # The +1 keeps the denominator at 1 (not 0) for vehicles whose model year
    # equals the reference year.
    df['mileage_per_year'] = np.clip(df['odometer'] / (df['age'] + 1), 0, 200000)
    return df

data = create_features(data)

# Handle categorical features
categorical_features = ['make', 'model', 'condition', 'fuel', 'title_status',
                        'transmission', 'drive', 'size', 'type', 'paint_color']
# Keep only the categorical columns that are actually present in the dataset.
# The list is reused later to select model inputs, so leaving an absent column
# in it would raise a KeyError when indexing `data`.
categorical_features = [feature for feature in categorical_features
                        if feature in data.columns]

label_encoders = {}  # feature name -> fitted LabelEncoder
encoding_dict = {}  # feature name -> {category label: integer code}, for the app

for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le
    # LabelEncoder assigns each class its position in `classes_`, so
    # enumerating `classes_` reproduces the label -> code mapping.
    encoding_dict[feature] = {label: code for code, label in enumerate(le.classes_)}

# Save the encoding dictionary to a CSV.
# After the transpose, rows are category labels and columns are features.
# The row index (the labels) must be written out: with index=False the file
# would contain only the integer codes and the label -> code mapping would be
# unrecoverable. The labels are stored in a leading "category" column; the
# per-feature columns are unchanged.
encoding_df = pd.DataFrame.from_dict(encoding_dict, orient='index').transpose()
encoding_df.to_csv("categorical_encodings.csv", index=True, index_label="category")

# Prepare features and labels
numeric_features = ['year', 'odometer', 'age', 'age_squared', 'mileage_per_year']
features = numeric_features + categorical_features
X = data[features]
y = np.log1p(data['price'])  # Log-transform the price for better model performance

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with scaling and regression
model = Pipeline([
    ('scaler', RobustScaler()),
    ('regressor', RandomForestRegressor(
        n_estimators=300, max_depth=25, random_state=42, n_jobs=-1))
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model.
# NOTE: the target is log1p(price), so these metrics are on the log scale,
# not in price units.
y_pred = model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

# Save the fitted pipeline (scaler + regressor). The LabelEncoder objects are
# not persisted here.
joblib.dump(model, "car_price_modelv3.pkl")
print("Model saved successfully.")

# Directory that receives the diagnostic figures (created if it does not exist)
viz_path = '/Users/estebanm/Desktop/carShopping_tool/CAR/visualizations'
os.makedirs(viz_path, exist_ok=True)

# 1. Price Distribution Plot: histogram of the price column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='price', bins=50, ax=ax)
ax.set_title('Price Distribution')
fig.savefig(os.path.join(viz_path, 'price_distribution_plot.png'))
plt.close(fig)

# 2. Actual vs Predicted Plot
# Undo the log1p transform so both axes are in price units.
actual_prices = np.expm1(y_test)
predicted_prices = np.expm1(y_pred)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(actual_prices, predicted_prices, alpha=0.5)
# Dashed red diagonal marks where predicted == actual.
price_span = [actual_prices.min(), actual_prices.max()]
ax.plot(price_span, price_span, 'r--')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Prices')
fig.savefig(os.path.join(viz_path, 'actual_vs_predicted_scatter.png'))
plt.close(fig)

# 3. Feature Importance Plot
# Importances come from the fitted random forest inside the pipeline.
feature_importance = model.named_steps['regressor'].feature_importances_
feature_names = numeric_features + categorical_features

fig, ax = plt.subplots(figsize=(12, 6))
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values('importance', ascending=True)
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_title('Feature Importance')
fig.savefig(os.path.join(viz_path, 'feature_importance_plot.png'))
plt.close(fig)

# 4. Residuals Distribution Plot: actual minus predicted, in price units
residuals = actual_prices - predicted_prices
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, bins=50, ax=ax)
ax.set_title('Residuals Distribution')
ax.set_xlabel('Residuals')
fig.savefig(os.path.join(viz_path, 'residuals_distribution_plot.png'))
plt.close(fig)