Spaces:
Runtime error
Runtime error
Commit
•
280110b
1
Parent(s):
c6371ba
initial commit
Browse files- .gitignore +3 -0
- MRT_LRT_STATION.csv +166 -0
- README.md +0 -12
- app.py +83 -0
- df_MRT.pkl +0 -0
- model.log +1 -0
- requirements.txt +2 -0
- utils.py +234 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
*.ipynb
|
3 |
+
|
MRT_LRT_STATION.csv
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Station
|
2 |
+
Jurong East MRT
|
3 |
+
Bukit Batok MRT
|
4 |
+
Bukit Gombak MRT
|
5 |
+
Choa Chu Kang MRT
|
6 |
+
Yew Tee MRT
|
7 |
+
Kranji MRT
|
8 |
+
Marsiling MRT
|
9 |
+
Woodlands MRT
|
10 |
+
Admiralty MRT
|
11 |
+
Sembawang MRT
|
12 |
+
Canberra MRT
|
13 |
+
Yishun MRT
|
14 |
+
Khatib MRT
|
15 |
+
Yio Chu Kang MRT
|
16 |
+
Ang Mo Kio MRT
|
17 |
+
Bishan MRT
|
18 |
+
Braddell MRT
|
19 |
+
Toa Payoh MRT
|
20 |
+
Novena MRT
|
21 |
+
Newton MRT
|
22 |
+
Orchard MRT
|
23 |
+
Somerset MRT
|
24 |
+
Dhoby Ghaut MRT
|
25 |
+
City Hall MRT
|
26 |
+
Raffles Place MRT
|
27 |
+
Marina Bay MRT
|
28 |
+
Marina South Pier MRT
|
29 |
+
Pasir Ris MRT
|
30 |
+
Tampines MRT
|
31 |
+
Simei MRT
|
32 |
+
Tanah Merah MRT
|
33 |
+
Bedok MRT
|
34 |
+
Kembangan MRT
|
35 |
+
Eunos MRT
|
36 |
+
Paya Lebar MRT
|
37 |
+
Aljunied MRT
|
38 |
+
Kallang MRT
|
39 |
+
Lavender MRT
|
40 |
+
Bugis MRT
|
41 |
+
Tanjong Pagar MRT
|
42 |
+
Outram Park MRT
|
43 |
+
Tiong Bahru MRT
|
44 |
+
Redhill MRT
|
45 |
+
Queenstown MRT
|
46 |
+
Commonwealth MRT
|
47 |
+
Buona Vista MRT
|
48 |
+
Dover MRT
|
49 |
+
Clementi MRT
|
50 |
+
Chinese Garden MRT
|
51 |
+
Lakeside MRT
|
52 |
+
Boon Lay MRT
|
53 |
+
Pioneer MRT
|
54 |
+
Joo Koon MRT
|
55 |
+
Gul Circle MRT
|
56 |
+
Tuas Crescent MRT
|
57 |
+
Tuas West Road MRT
|
58 |
+
Tuas Link MRT
|
59 |
+
Expo MRT
|
60 |
+
Changi Airport MRT
|
61 |
+
HarbourFront MRT
|
62 |
+
Chinatown MRT
|
63 |
+
Clarke Quay MRT
|
64 |
+
Little India MRT
|
65 |
+
Farrer Park MRT
|
66 |
+
Boon Keng MRT
|
67 |
+
Potong Pasir MRT
|
68 |
+
Woodleigh MRT
|
69 |
+
Serangoon MRT
|
70 |
+
Kovan MRT
|
71 |
+
Hougang MRT
|
72 |
+
Buangkok MRT
|
73 |
+
Sengkang MRT
|
74 |
+
Punggol MRT
|
75 |
+
Bras Basah MRT
|
76 |
+
Esplanade MRT
|
77 |
+
Promenade MRT
|
78 |
+
Nicoll Highway MRT
|
79 |
+
Stadium MRT
|
80 |
+
Mountbatten MRT
|
81 |
+
Dakota MRT
|
82 |
+
MacPherson MRT
|
83 |
+
Tai Seng MRT
|
84 |
+
Bartley MRT
|
85 |
+
Lorong Chuan MRT
|
86 |
+
Marymount MRT
|
87 |
+
Caldecott MRT
|
88 |
+
Botanic Gardens MRT
|
89 |
+
Farrer Road MRT
|
90 |
+
Holland Village MRT
|
91 |
+
one-north MRT
|
92 |
+
Kent Ridge MRT
|
93 |
+
Haw Par Villa MRT
|
94 |
+
Pasir Panjang MRT
|
95 |
+
Labrador Park MRT
|
96 |
+
Telok Blangah MRT
|
97 |
+
Bayfront MRT
|
98 |
+
Bukit Panjang MRT
|
99 |
+
Cashew MRT
|
100 |
+
Hillview MRT
|
101 |
+
Beauty World MRT
|
102 |
+
King Albert Park MRT
|
103 |
+
Sixth Avenue MRT
|
104 |
+
Tan Kah Kee MRT
|
105 |
+
Stevens MRT
|
106 |
+
Rochor MRT
|
107 |
+
Downtown MRT
|
108 |
+
Telok Ayer MRT
|
109 |
+
Fort Canning MRT
|
110 |
+
Bencoolen MRT
|
111 |
+
Jalan Besar MRT
|
112 |
+
Bendemeer MRT
|
113 |
+
Geylang Bahru MRT
|
114 |
+
Mattar MRT
|
115 |
+
Ubi MRT
|
116 |
+
Kaki Bukit MRT
|
117 |
+
Bedok North MRT
|
118 |
+
Bedok Reservoir MRT
|
119 |
+
Tampines West MRT
|
120 |
+
Tampines East MRT
|
121 |
+
Upper Changi MRT
|
122 |
+
Woodlands North MRT
|
123 |
+
Woodlands South MRT
|
124 |
+
Springleaf MRT
|
125 |
+
Lentor MRT
|
126 |
+
Mayflower MRT
|
127 |
+
Bright Hill MRT
|
128 |
+
Upper Thomson MRT
|
129 |
+
South View LRT
|
130 |
+
Keat Hong LRT
|
131 |
+
Teck Whye LRT
|
132 |
+
Phoenix LRT
|
133 |
+
Petir LRT
|
134 |
+
Pending LRT
|
135 |
+
Bangkit LRT
|
136 |
+
Fajar LRT
|
137 |
+
Segar LRT
|
138 |
+
Jelapang LRT
|
139 |
+
Senja LRT
|
140 |
+
Ten Mile Junction LRT
|
141 |
+
Compassvale LRT
|
142 |
+
Rumbia LRT
|
143 |
+
Bakau LRT
|
144 |
+
Kangkar LRT
|
145 |
+
Ranggung LRT
|
146 |
+
Cheng Lim LRT
|
147 |
+
Farmway LRT
|
148 |
+
Kupang LRT
|
149 |
+
Thanggam LRT
|
150 |
+
Fernvale LRT
|
151 |
+
Layar LRT
|
152 |
+
Tongkang LRT
|
153 |
+
Renjong LRT
|
154 |
+
Cove LRT
|
155 |
+
Meridian LRT
|
156 |
+
Coral Edge LRT
|
157 |
+
Riviera LRT
|
158 |
+
Kadaloor LRT
|
159 |
+
Oasis LRT
|
160 |
+
Damai LRT
|
161 |
+
Sam Kee LRT
|
162 |
+
Punggol Point LRT
|
163 |
+
Samudera LRT
|
164 |
+
Nibong LRT
|
165 |
+
Sumang LRT
|
166 |
+
Soo Teck LRT
|
README.md
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: Resale_HDB_price_prediction_model
|
3 |
-
emoji: 🌍
|
4 |
-
colorFrom: pink
|
5 |
-
colorTo: red
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 2.9.4
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import utils
|
3 |
+
import pickle
|
4 |
+
import pandas as pd
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
def main(address: str, floor_area_sqft: int, remaining_lease_years: int, flat_type, storey_range, year_sold: int, update_model: bool):
    """Predict the resale price per square foot of a flat with the saved model.

    Args:
        address: Free-text address or postal code; forwarded to the ``utils``
            distance helpers.
        floor_area_sqft: Floor area in square feet.
        remaining_lease_years: Remaining lease in years.
        flat_type: '1 ROOM' .. '5 ROOM', 'EXECUTIVE' or 'MULTI-GENERATION'.
            Any other value leaves every flat-type indicator at 0.
        storey_range: 'LOW FLOOR', 'MID FLOOR' or 'HIGH FLOOR'. Any other
            value leaves every storey indicator at 0.
        year_sold: Forwarded to ``utils.get_update`` when ``update_model`` is
            true; not used otherwise.
        update_model: When true, ``utils.get_update(year_sold)`` runs before
            the prediction.

    Returns:
        A 4-tuple of (PSF label, predicted PSF rounded to 2 decimals,
        property-value label, rounded PSF times ``floor_area_sqft``), or the
        string ``'ERROR: No saved model'`` when ``model.sav`` does not exist.
    """
    model_file = 'model.sav'

    # (dropdown value, model column) pairs. The column order below mirrors the
    # order of the original feature frame and must not be rearranged.
    flat_type_columns = (
        ('1 ROOM', 'flat_type_1 ROOM'),
        ('2 ROOM', 'flat_type_2 ROOM'),
        ('3 ROOM', 'flat_type_3 ROOM'),
        ('4 ROOM', 'flat_type_4 ROOM'),
        ('5 ROOM', 'flat_type_5 ROOM'),
        ('EXECUTIVE', 'flat_type_EXECUTIVE'),
        ('MULTI-GENERATION', 'flat_type_MULTI-GENERATION'),
    )
    storey_columns = (
        ('HIGH FLOOR', 'storey_range_High Floor'),
        ('LOW FLOOR', 'storey_range_Low Floor'),
        ('MID FLOOR', 'storey_range_Mid Floor'),
    )

    # loading model
    if update_model:
        utils.get_update(year_sold)

    if not os.path.exists(model_file):
        return ('ERROR: No saved model')

    # NOTE: pickle executes code on load; model.sav must only ever be a file
    # written by this application.
    with open(model_file, 'rb') as saved_model:
        model = pickle.load(saved_model)

    features = {
        'floor_area_sqft': floor_area_sqft,
        'remaining_lease_years': remaining_lease_years,
        'distance_to_nearest_MRT_station': utils.distance_to_nearest_MRT_station(address),
        'distance_to_city': utils.distance_to_city(address),
    }
    # Plain 0/1 integers (the previous `x = 1,` assignments built 1-tuples).
    features.update({column: int(flat_type == label) for label, column in flat_type_columns})
    features.update({column: int(storey_range == label) for label, column in storey_columns})
    input_dict = pd.DataFrame(features, index=[0])

    # Predict once and reuse the rounded value for both outputs.
    predicted_psf = round(model.predict(input_dict).item(), 2)
    return ('Predicted PSF:', predicted_psf, '\nPredicted Property Value:', predicted_psf * floor_area_sqft)
|
70 |
+
|
71 |
+
# Input widgets, listed in the same order as the parameters of main().
_input_widgets = [
    gr.inputs.Textbox(
        lines=2,
        placeholder="Example: 88 dawson road or Singapore 142088",
        default=None,
        label="Address",
    ),
    gr.inputs.Number(default=893, label='Floor Area (sqft)', optional=False),
    gr.inputs.Number(default=None, label='Remaining Lease (years)', optional=False),
    gr.inputs.Dropdown(
        choices=['1 ROOM', '2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM', 'EXECUTIVE', 'MULTI-GENERATION'],
        type="value",
        default=None,
        label="Flat Type",
    ),
    gr.inputs.Dropdown(
        choices=['LOW FLOOR', 'MID FLOOR', 'HIGH FLOOR'],
        type="value",
        default=None,
        label="Storey Range",
    ),
    gr.inputs.Number(default=2020, label='Data from:', optional=False),
    gr.inputs.Checkbox(default=False, label="Update Model?", optional=False),
]

# Single text box that displays whatever main() returns.
_output_widgets = [gr.outputs.Textbox(type="auto", label='Predicted Price per SQFT')]

iface = gr.Interface(fn=main, inputs=_input_widgets, outputs=_output_widgets)
iface.launch()
|
df_MRT.pkl
ADDED
Binary file (49.5 kB). View file
|
|
model.log
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
model_score = 32.55912733816058
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
geopy
|
2 |
+
scikit-learn
|
utils.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pickle
|
3 |
+
import requests
|
4 |
+
import geopy.distance
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
|
9 |
+
from sklearn.metrics import mean_squared_error, mean_absolute_error
|
10 |
+
from sklearn.model_selection import train_test_split
|
11 |
+
from sklearn.pipeline import make_pipeline
|
12 |
+
from sklearn.preprocessing import StandardScaler
|
13 |
+
from sklearn.ensemble import RandomForestRegressor
|
14 |
+
|
15 |
+
def get_data_data_gov(url):
    """Run a data.gov.sg datastore query and return its records as a DataFrame.

    Args:
        url: Complete query URL to request.

    Returns:
        DataFrame built from ``result.records`` of the JSON response, with the
        ``_id`` column dropped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    resp = requests.get(url)
    # Surface HTTP failures here instead of as a JSON or key error further down.
    resp.raise_for_status()
    records = resp.json()['result']['records']
    print('Number of records:', len(records))
    return pd.DataFrame.from_dict(records).drop(['_id'], axis=1)
|
25 |
+
|
26 |
+
def get_data_singstat_price_index(url):
    """Fetch a SingStat table and return the columns of its first row as a DataFrame.

    Args:
        url: Complete table-data URL to request.

    Returns:
        DataFrame built from ``Data.row[0].columns`` of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    resp = requests.get(url)
    # Surface HTTP failures here instead of as a JSON or key error further down.
    resp.raise_for_status()
    columns = resp.json()['Data']['row'][0]['columns']
    print('Number of records:', len(columns))
    return pd.DataFrame.from_dict(columns)
|
35 |
+
|
36 |
+
def get_data_one_map(address, is_LAT_LONG_only=False):
    """Look up ``address`` with the OneMap search API and return the first hit.

    Args:
        address: Search text; it is converted with ``str()`` before use.
        is_LAT_LONG_only: When true, return only the coordinates of the hit.

    Returns:
        ``None`` when the search reports no match. Otherwise the first result
        dict, or its ``(LATITUDE, LONGITUDE)`` pair when ``is_LAT_LONG_only``
        is true (values are returned exactly as the API supplies them).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    from urllib.parse import quote

    # Percent-encode the search text so characters such as '&' or '#' cannot
    # terminate or extend the query string.
    query_string = (
        'https://developers.onemap.sg/commonapi/search?searchVal='
        + quote(str(address), safe='')
        + '&returnGeom=Y&getAddrDetails=Y'
    )
    resp = requests.get(query_string)
    resp.raise_for_status()
    data = resp.json()  # convert from json to python dict

    results = data.get('results') or []
    if not data.get('found') or not results:
        return None

    first_hit = results[0]
    if is_LAT_LONG_only:
        return (first_hit['LATITUDE'], first_hit['LONGITUDE'])
    return first_hit
|
48 |
+
|
49 |
+
def distance_to_city(address):
    """Return the great-circle distance in km between ``address`` and the city reference point.

    Args:
        address: Search text resolved through ``get_data_one_map``.

    Returns:
        Distance in kilometres as computed by ``geopy.distance.great_circle``.

    Raises:
        ValueError: If the address cannot be resolved to coordinates.
    """
    # Fixed reference coordinates (labelled "City Hall" where get_update
    # computes the same feature for the training data).
    city_coordinates = (1.29293672227779, 103.852585580366)

    hdb_coordinates = get_data_one_map(address, is_LAT_LONG_only=True)
    if hdb_coordinates is None:
        # Fail with a clear message instead of handing None to geopy.
        raise ValueError(f'Address not found: {address!r}')
    return geopy.distance.great_circle(city_coordinates, hdb_coordinates).km
|
52 |
+
|
53 |
+
def distance_to_nearest_MRT_station(address):
    """Return the great-circle distance in km from ``address`` to the closest stored station.

    Station coordinates are read from ``./df_MRT.pkl``.

    Args:
        address: Search text resolved through ``get_data_one_map``.

    Returns:
        The smallest distance in kilometres over all stored station coordinates.

    Raises:
        ValueError: If the address cannot be resolved to coordinates.
    """
    hdb_coordinates = get_data_one_map(address, is_LAT_LONG_only=True)
    if hdb_coordinates is None:
        # Fail with a clear message instead of handing None to geopy.
        raise ValueError(f'Address not found: {address!r}')

    # NOTE: read_pickle executes code on load; df_MRT.pkl must only ever be a
    # file produced by this application.
    df_MRT = pd.read_pickle('./df_MRT.pkl')

    # Select the coordinate column by name; fall back to the last column (the
    # previous positional behaviour) if the frame has no LAT_LONG column.
    if 'LAT_LONG' in df_MRT.columns:
        station_coordinates = df_MRT['LAT_LONG'].tolist()
    else:
        station_coordinates = df_MRT.iloc[:, -1].tolist()

    return min(
        geopy.distance.great_circle(hdb_coordinates, coordinates).km
        for coordinates in station_coordinates
    )
|
61 |
+
|
62 |
+
def month_to_quarter(x):
    """Convert a 'YYYY-MM' string into a 'YYYY-nQ' quarter label.

    Months up to 3 map to '1Q', up to 6 to '2Q', up to 9 to '3Q' and anything
    larger to '4Q'. Only the first two '-'-separated fields are used.
    """
    parts = x.split('-')
    year_number = int(parts[0])
    month_number = int(parts[1])

    # Walk the last month of each of the first three quarters; anything past
    # September falls through to the fourth quarter.
    for quarter_number, last_month in enumerate((3, 6, 9), start=1):
        if month_number <= last_month:
            break
    else:
        quarter_number = 4

    return f'{year_number}-{quarter_number}Q'
|
74 |
+
|
75 |
+
def _storey_band(storey_range):
    """Collapse a storey label such as '07 TO 09' into 'Low Floor', 'Mid Floor' or 'High Floor'."""
    try:
        lowest_storey = int(str(storey_range).split()[0])
    except (ValueError, IndexError):
        return storey_range  # leave labels that do not start with a number untouched
    if lowest_storey <= 3:
        return 'Low Floor'
    if lowest_storey <= 9:
        return 'Mid Floor'
    return 'High Floor'


def _geocode_all(search_values):
    """Look up every value with get_data_one_map and return the hits as a DataFrame with a LAT_LONG column."""
    hits = [hit for hit in map(get_data_one_map, search_values) if hit is not None]
    if not hits:
        raise ValueError('None of the requested locations could be resolved')
    df_hits = pd.DataFrame(hits)
    # Coordinates stay exactly as the API returned them, paired into tuples.
    df_hits['LAT_LONG'] = list(zip(df_hits['LATITUDE'], df_hits['LONGITUDE']))
    return df_hits


def _abbreviate_road_names(addresses):
    """Replace spelled-out road words with the short forms needed to join on the resale 'address' column."""
    abbreviations = (
        ('AVENUE', 'AVE'),
        ('CRESCENT', 'CRES'),
        ('ROAD', 'RD'),
        ('STREET', 'ST'),
        ('CENTRAL', 'CTRL'),
        ('HEIGHTS', 'HTS'),
        ('TERRACE', 'TER'),
        ('JALAN', 'JLN'),
        ('DRIVE', 'DR'),
        ('PLACE', 'PL'),
        ('CLOSE', 'CL'),
        ('PARK', 'PK'),
        ('GARDENS', 'GDNS'),
        ('NORTH', 'NTH'),
        ('SOUTH', 'STH'),
        ('BUKIT', 'BT'),
        ('UPPER', 'UPP'),
        ('COMMONWEALTH', "C'WEALTH"),
    )
    for full_word, short_form in abbreviations:
        # Whole words only, so e.g. NORTHSHORE is not turned into NTHSHORE.
        addresses = addresses.str.replace(rf'\b{full_word}\b', short_form, regex=True)
    return addresses


def _read_logged_score(log_path):
    """Return the last number on the first line of the score log, or infinity if it cannot be read."""
    try:
        with open(log_path, 'r') as log_file:
            return float(log_file.readline().split()[-1])
    except (OSError, ValueError, IndexError):
        return float('inf')


def _save_model_if_better(model, rmse, model_path='model.sav', log_path='model.log'):
    """Persist the model and its RMSE unless a saved model with an equal or lower logged RMSE exists."""
    import os

    if os.path.exists(model_path) and rmse >= _read_logged_score(log_path):
        return False
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(log_path, 'w') as log_file:
        log_file.write(f'model_score = {rmse}')
    return True


def get_update(data_year):
    """Download resale data, rebuild the training set, fit a new model and keep it if it is better.

    Steps: fetch the resale records and the price index, keep sales from
    ``data_year`` onwards, scale prices to the latest index value, geocode the
    flats and the stations listed in ``./MRT_LRT_STATION.csv``, derive the
    distance features, fit a ``StandardScaler`` + ``RandomForestRegressor``
    pipeline on an 80/20 split and print its error metrics.

    Side effects: writes ``df_MRT.pkl``. Writes ``model.sav`` and ``model.log``
    when no model has been saved yet or when the new test RMSE is lower than
    the score recorded in ``model.log``.

    Args:
        data_year: Earliest ``year_sold`` to include in the training data.

    Returns:
        None.
    """
    ### DATA EXTRACTION AND PREPROCESSING ###
    df_raw_data = get_data_data_gov('https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&limit=1000000')
    df_raw_data['address'] = df_raw_data['block'] + ' ' + df_raw_data['street_name']
    df_raw_data['quarter'] = df_raw_data['month'].apply(month_to_quarter)
    df_raw_data['year_sold'] = df_raw_data['month'].apply(lambda x: int(x.split('-')[0]))
    df_raw_data['month_sold'] = df_raw_data['month'].apply(lambda x: int(x.split('-')[1]))
    df_raw_data['remaining_lease_years'] = df_raw_data['remaining_lease'].apply(lambda x: int(x.split()[0]))
    df_raw_data['floor_area_sqft'] = (df_raw_data['floor_area_sqm'].astype(float) * 10.764).round()

    df_price_index = get_data_singstat_price_index('https://tablebuilder.singstat.gov.sg/api/table/tabledata/M212161?isTestApi=true')
    df_price_index = df_price_index.rename(columns={'key': 'quarter', 'value': 'index'})
    df_price_index['quarter'] = df_price_index['quarter'].str.replace(' ', '-', regex=False)
    df_price_index['index'] = pd.to_numeric(df_price_index['index'])
    latest_quarter = df_price_index['quarter'].iloc[-1]
    latest_index = float(df_price_index['index'].iloc[-1])

    # Copy so the column assignments below act on an independent frame rather
    # than on a filtered view of df_raw_data.
    df_selected_year = df_raw_data[df_raw_data['year_sold'] >= data_year].copy()

    # Sales in quarters missing from the price index use the latest indexed quarter.
    has_index = df_selected_year['quarter'].isin(set(df_price_index['quarter']))
    df_selected_year['quarter'] = df_selected_year['quarter'].where(has_index, latest_quarter)
    df_selected_year = pd.merge(df_selected_year, df_price_index, how='left', on='quarter')

    # convert to float
    df_selected_year['resale_price'] = pd.to_numeric(df_selected_year['resale_price'])

    # normalised to latest price index
    df_selected_year['normalised_resale_price'] = (df_selected_year['resale_price'] * latest_index / df_selected_year['index']).round(0)
    df_selected_year['price_psf'] = (df_selected_year['normalised_resale_price'] / df_selected_year['floor_area_sqft']).round()

    df_selected_year['storey_range'] = df_selected_year['storey_range'].apply(_storey_band)

    df_selected_year = df_selected_year.drop(columns=['street_name', 'resale_price', 'remaining_lease', 'lease_commence_date', 'block'])

    df_HDB = _geocode_all(df_selected_year['address'].unique().tolist())

    tmp_df = pd.read_csv('./MRT_LRT_STATION.csv')
    MRT_list = tmp_df['Station'].unique().tolist()
    print('Number of MRT stations:', len(MRT_list))

    df_MRT = _geocode_all(MRT_list)
    df_MRT.to_pickle('df_MRT.pkl')
    MRT_coordinates = df_MRT['LAT_LONG'].tolist()

    # get the distance from each address to the nearest station
    df_HDB['distance_to_nearest_MRT_station'] = df_HDB['LAT_LONG'].apply(
        lambda hdb: min(geopy.distance.great_circle(hdb, station).km for station in MRT_coordinates)
    )
    df_HDB = df_HDB.drop(columns=['SEARCHVAL', 'BUILDING', 'ADDRESS', 'X', 'Y', 'LONGTITUDE'], errors='ignore')

    # City Hall: 1.29293672227779 103.852585580366
    df_HDB['distance_to_city'] = df_HDB['LAT_LONG'].apply(lambda x: geopy.distance.great_circle((1.29293672227779, 103.852585580366), x).km)

    df_HDB['address'] = _abbreviate_road_names(df_HDB['BLK_NO'] + ' ' + df_HDB['ROAD_NAME'])
    # One geocoded row per address, so the left join cannot multiply sales rows.
    df_HDB = df_HDB.drop_duplicates(subset='address')

    df_clean_data = pd.merge(df_selected_year, df_HDB, on='address', how='left')
    df_clean_data = df_clean_data.dropna(subset=['LAT_LONG'])

    ### FEATURE SELECTION ###
    features = [
        'flat_type',
        'storey_range',
        'floor_area_sqft',
        'remaining_lease_years',
        'distance_to_nearest_MRT_station',
        'distance_to_city',
        'price_psf']

    df = pd.get_dummies(df_clean_data[features])

    # Fixed column set and order, identical to the frame app.py builds for
    # prediction; categories absent from this download become all-zero columns.
    model_columns = [
        'floor_area_sqft',
        'remaining_lease_years',
        'distance_to_nearest_MRT_station',
        'distance_to_city',
        'flat_type_1 ROOM',
        'flat_type_2 ROOM',
        'flat_type_3 ROOM',
        'flat_type_4 ROOM',
        'flat_type_5 ROOM',
        'flat_type_EXECUTIVE',
        'flat_type_MULTI-GENERATION',
        'storey_range_High Floor',
        'storey_range_Low Floor',
        'storey_range_Mid Floor']

    ### MODEL TRAINING AND TESTING ###
    X = df.drop('price_psf', axis=1).reindex(columns=model_columns, fill_value=0)
    y = df['price_psf']

    # The target is a price per square foot, so the labels say so.
    print('Average price_psf:', y.mean())
    print('Min price_psf:', y.min())
    print('Max price_psf:', y.max())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model_RFR = make_pipeline(StandardScaler(), RandomForestRegressor())
    model_RFR.fit(X_train, y_train)  # apply scaling on training data
    print('R^2 score:', model_RFR.score(X_test, y_test))

    # Predict once and derive every metric from the same predictions.
    predictions = model_RFR.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    print('Mean Absolute Error:', mean_absolute_error(y_test, predictions))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)

    # model.log stores an RMSE, so compare RMSE with RMSE and keep the lower one.
    _save_model_if_better(model_RFR, rmse)
|