File size: 2,891 Bytes
25a7261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
"""dynamic pricing.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pMuvzwELNm1DsTdL5dfBdA2HCjB6uwgh
"""

# Commented out IPython magic to ensure Python compatibility.
import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

dataset = pd.read_csv("Pop_Data.csv")
dataset.head(5)

X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1],
                                                    dataset.iloc[:, -1],
                                                    test_size = 0.3,
                                                    random_state = 42)

X_train.info()

"""# EDA"""

X_train = X_train.iloc[:, 3:]
X_test = X_test.iloc[:, 3:]

X_train.info

plt.figure(figsize = (12, 8))
plot = sns.countplot(x = 'day_of_week', data = X_train)
plt.xticks(rotation = 90)
for p in plot.patches:
    plot.annotate(p.get_height(),
                        (p.get_x() + p.get_width() / 2.0,
                         p.get_height()),
                        ha = 'center',
                        va = 'center',
                        xytext = (0, 5),
                        textcoords = 'offset points')

plt.title("Price changes based on day")
plt.xlabel("Day")
plt.ylabel("Price")

print(sum(X_train["day_of_week"].isnull()))
print(sum(X_test["day_of_week"].isnull()))

print(sum(X_train["hour_of_day"].isnull()))
print(sum(X_test["hour_of_day"].isnull()))

print(sum(X_train["popularity_percent_normal"].isnull()))
print(sum(X_test["popularity_percent_normal"].isnull()))

X_train["popularity_percent_normal"].fillna(X_train["popularity_percent_normal"].astype("float64").mean(), inplace = True)

X_train = pd.get_dummies(X_train,
                         columns = ["day_of_week"],
                         drop_first = True)

X_test = pd.get_dummies(X_test,
                         columns = ["day_of_week"],
                         drop_first = True)

missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[X_train.columns]

standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train = standardScaler.transform(X_train)
X_test = standardScaler.transform(X_test)

linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
y_pred = linearRegression.predict(X_test)
r2_score(y_test, y_pred)

rf = RandomForestRegressor(n_estimators = 100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
r2_score(y_test, y_pred)

def predict(x_test):
    return rf.predict(x_test)