jamesnzeex committed on
Commit
280110b
1 Parent(s): c6371ba

initial commit

Browse files
Files changed (8) hide show
  1. .gitignore +3 -0
  2. MRT_LRT_STATION.csv +166 -0
  3. README.md +0 -12
  4. app.py +83 -0
  5. df_MRT.pkl +0 -0
  6. model.log +1 -0
  7. requirements.txt +2 -0
  8. utils.py +234 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__
2
+ *.ipynb
3
+
MRT_LRT_STATION.csv ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Station
2
+ Jurong East MRT
3
+ Bukit Batok MRT
4
+ Bukit Gombak MRT
5
+ Choa Chu Kang MRT
6
+ Yew Tee MRT
7
+ Kranji MRT
8
+ Marsiling MRT
9
+ Woodlands MRT
10
+ Admiralty MRT
11
+ Sembawang MRT
12
+ Canberra MRT
13
+ Yishun MRT
14
+ Khatib MRT
15
+ Yio Chu Kang MRT
16
+ Ang Mo Kio MRT
17
+ Bishan MRT
18
+ Braddell MRT
19
+ Toa Payoh MRT
20
+ Novena MRT
21
+ Newton MRT
22
+ Orchard MRT
23
+ Somerset MRT
24
+ Dhoby Ghaut MRT
25
+ City Hall MRT
26
+ Raffles Place MRT
27
+ Marina Bay MRT
28
+ Marina South Pier MRT
29
+ Pasir Ris MRT
30
+ Tampines MRT
31
+ Simei MRT
32
+ Tanah Merah MRT
33
+ Bedok MRT
34
+ Kembangan MRT
35
+ Eunos MRT
36
+ Paya Lebar MRT
37
+ Aljunied MRT
38
+ Kallang MRT
39
+ Lavender MRT
40
+ Bugis MRT
41
+ Tanjong Pagar MRT
42
+ Outram Park MRT
43
+ Tiong Bahru MRT
44
+ Redhill MRT
45
+ Queenstown MRT
46
+ Commonwealth MRT
47
+ Buona Vista MRT
48
+ Dover MRT
49
+ Clementi MRT
50
+ Chinese Garden MRT
51
+ Lakeside MRT
52
+ Boon Lay MRT
53
+ Pioneer MRT
54
+ Joo Koon MRT
55
+ Gul Circle MRT
56
+ Tuas Crescent MRT
57
+ Tuas West Road MRT
58
+ Tuas Link MRT
59
+ Expo MRT
60
+ Changi Airport MRT
61
+ HarbourFront MRT
62
+ Chinatown MRT
63
+ Clarke Quay MRT
64
+ Little India MRT
65
+ Farrer Park MRT
66
+ Boon Keng MRT
67
+ Potong Pasir MRT
68
+ Woodleigh MRT
69
+ Serangoon MRT
70
+ Kovan MRT
71
+ Hougang MRT
72
+ Buangkok MRT
73
+ Sengkang MRT
74
+ Punggol MRT
75
+ Bras Basah MRT
76
+ Esplanade MRT
77
+ Promenade MRT
78
+ Nicoll Highway MRT
79
+ Stadium MRT
80
+ Mountbatten MRT
81
+ Dakota MRT
82
+ MacPherson MRT
83
+ Tai Seng MRT
84
+ Bartley MRT
85
+ Lorong Chuan MRT
86
+ Marymount MRT
87
+ Caldecott MRT
88
+ Botanic Gardens MRT
89
+ Farrer Road MRT
90
+ Holland Village MRT
91
+ one-north MRT
92
+ Kent Ridge MRT
93
+ Haw Par Villa MRT
94
+ Pasir Panjang MRT
95
+ Labrador Park MRT
96
+ Telok Blangah MRT
97
+ Bayfront MRT
98
+ Bukit Panjang MRT
99
+ Cashew MRT
100
+ Hillview MRT
101
+ Beauty World MRT
102
+ King Albert Park MRT
103
+ Sixth Avenue MRT
104
+ Tan Kah Kee MRT
105
+ Stevens MRT
106
+ Rochor MRT
107
+ Downtown MRT
108
+ Telok Ayer MRT
109
+ Fort Canning MRT
110
+ Bencoolen MRT
111
+ Jalan Besar MRT
112
+ Bendemeer MRT
113
+ Geylang Bahru MRT
114
+ Mattar MRT
115
+ Ubi MRT
116
+ Kaki Bukit MRT
117
+ Bedok North MRT
118
+ Bedok Reservoir MRT
119
+ Tampines West MRT
120
+ Tampines East MRT
121
+ Upper Changi MRT
122
+ Woodlands North MRT
123
+ Woodlands South MRT
124
+ Springleaf MRT
125
+ Lentor MRT
126
+ Mayflower MRT
127
+ Bright Hill MRT
128
+ Upper Thomson MRT
129
+ South View LRT
130
+ Keat Hong LRT
131
+ Teck Whye LRT
132
+ Phoenix LRT
133
+ Petir LRT
134
+ Pending LRT
135
+ Bangkit LRT
136
+ Fajar LRT
137
+ Segar LRT
138
+ Jelapang LRT
139
+ Senja LRT
140
+ Ten Mile Junction LRT
141
+ Compassvale LRT
142
+ Rumbia LRT
143
+ Bakau LRT
144
+ Kangkar LRT
145
+ Ranggung LRT
146
+ Cheng Lim LRT
147
+ Farmway LRT
148
+ Kupang LRT
149
+ Thanggam LRT
150
+ Fernvale LRT
151
+ Layar LRT
152
+ Tongkang LRT
153
+ Renjong LRT
154
+ Cove LRT
155
+ Meridian LRT
156
+ Coral Edge LRT
157
+ Riviera LRT
158
+ Kadaloor LRT
159
+ Oasis LRT
160
+ Damai LRT
161
+ Sam Kee LRT
162
+ Punggol Point LRT
163
+ Samudera LRT
164
+ Nibong LRT
165
+ Sumang LRT
166
+ Soo Teck LRT
README.md DELETED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Resale_HDB_price_prediction_model
3
- emoji: 🌍
4
- colorFrom: pink
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 2.9.4
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import utils
3
+ import pickle
4
+ import pandas as pd
5
+ import gradio as gr
6
+
7
def main(address: str, floor_area_sqft: int, remaining_lease_years: int, flat_type, storey_range, year_sold: int, update_model: bool):
    """Predict the price per square foot and total value of a resale flat.

    Args:
        address: Free-text address or postal code; passed to the distance
            helpers in ``utils`` for geocoding.
        floor_area_sqft: Floor area in square feet.
        remaining_lease_years: Remaining lease in years.
        flat_type: One of '1 ROOM' ... '5 ROOM', 'EXECUTIVE',
            'MULTI-GENERATION'. Any other value leaves every flat-type
            indicator at 0.
        storey_range: 'LOW FLOOR', 'MID FLOOR' or 'HIGH FLOOR'. Any other
            value leaves every storey indicator at 0.
        year_sold: Forwarded to ``utils.get_update`` when ``update_model``
            is true; not used for the prediction itself.
        update_model: When true, ``utils.get_update(year_sold)`` is called
            before the saved model is loaded.

    Returns:
        A 4-tuple ``('Predicted PSF:', psf, '\\nPredicted Property Value:',
        psf * floor_area_sqft)`` with ``psf`` rounded to 2 decimals, or the
        string ``'ERROR: No saved model'`` when no model file exists.
    """
    model_file = 'model.sav'
    flat_types = ('1 ROOM', '2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM', 'EXECUTIVE', 'MULTI-GENERATION')
    # Order matters: it is the column order the feature frame is built in.
    storey_labels = ('High Floor', 'Low Floor', 'Mid Floor')

    if update_model:
        utils.get_update(year_sold)

    # Guard clause: check the same path that is opened below.
    if not os.path.exists(model_file):
        return ('ERROR: No saved model')

    # NOTE: pickle executes arbitrary code on load; only load a model file
    # that this application wrote itself.
    with open(model_file, 'rb') as model_handle:
        model = pickle.load(model_handle)

    features = {
        'floor_area_sqft': floor_area_sqft,
        'remaining_lease_years': remaining_lease_years,
        'distance_to_nearest_MRT_station': utils.distance_to_nearest_MRT_station(address),
        'distance_to_city': utils.distance_to_city(address),
    }
    # One-hot indicators as plain ints (the previous `x = 1,` assignments
    # produced 1-tuples by accident).
    for name in flat_types:
        features['flat_type_' + name] = int(flat_type == name)
    for label in storey_labels:
        features['storey_range_' + label] = int(storey_range == label.upper())

    input_dict = pd.DataFrame(features, index=[0])

    # Predict once and reuse the rounded value for both outputs.
    predicted_psf = round(model.predict(input_dict).item(), 2)
    return ('Predicted PSF:', predicted_psf, '\nPredicted Property Value:', predicted_psf * floor_area_sqft)
70
+
71
# Input widgets, listed in the same order as the parameters of main().
address_input = gr.inputs.Textbox(lines=2, placeholder= "Example: 88 dawson road or Singapore 142088", default=None, label="Address")
floor_area_input = gr.inputs.Number(default=893, label='Floor Area (sqft)', optional=False)
remaining_lease_input = gr.inputs.Number(default=None, label='Remaining Lease (years)', optional=False)
flat_type_input = gr.inputs.Dropdown(choices=['1 ROOM', '2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM', 'EXECUTIVE', 'MULTI-GENERATION'], type="value", default=None, label="Flat Type")
storey_range_input = gr.inputs.Dropdown(choices=['LOW FLOOR', 'MID FLOOR', 'HIGH FLOOR'], type="value", default=None, label="Storey Range")
data_year_input = gr.inputs.Number(default=2020, label='Data from:', optional=False)
update_model_input = gr.inputs.Checkbox(default=False, label="Update Model?", optional=False)

# Single text box that displays whatever main() returns.
prediction_output = gr.outputs.Textbox(type="auto", label='Predicted Price per SQFT')

iface = gr.Interface(
    fn = main,
    inputs = [
        address_input,
        floor_area_input,
        remaining_lease_input,
        flat_type_input,
        storey_range_input,
        data_year_input,
        update_model_input,
    ],
    outputs = [prediction_output])
iface.launch()
df_MRT.pkl ADDED
Binary file (49.5 kB). View file
 
model.log ADDED
@@ -0,0 +1 @@
 
 
1
+ model_score = 32.55912733816058
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ geopy
2
+ scikit-learn
utils.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pickle
3
+ import requests
4
+ import geopy.distance
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from sklearn.metrics import mean_squared_error, mean_absolute_error
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.pipeline import make_pipeline
12
+ from sklearn.preprocessing import StandardScaler
13
+ from sklearn.ensemble import RandomForestRegressor
14
+
15
def get_data_data_gov(url):
    """Download a data.gov.sg datastore query and return its records as a DataFrame.

    Args:
        url: Full query URL. The JSON response is expected to hold the rows
            under ``result -> records``.

    Returns:
        DataFrame built from the records, with the ``_id`` column dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # (connect, read) timeouts: the read timeout is generous because callers
    # may request a very large result set in a single query.
    resp = requests.get(url, timeout=(10, 300))
    resp.raise_for_status()  # fail here rather than on an unparsable error body
    records = resp.json()['result']['records']
    print('Number of records:', len(records))
    return pd.DataFrame.from_dict(records).drop(['_id'], axis=1)
25
+
26
def get_data_singstat_price_index(url):
    """Download a SingStat table and return the first row's columns as a DataFrame.

    Args:
        url: Full table-data URL. The JSON response is expected to hold the
            values under ``Data -> row[0] -> columns``.

    Returns:
        DataFrame built from that ``columns`` list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, timeout=(10, 120))
    resp.raise_for_status()  # fail here rather than on an unparsable error body
    # Walk the nested payload once and reuse it for both the count and the frame.
    columns = resp.json()['Data']['row'][0]['columns']
    print('Number of records:', len(columns))
    return pd.DataFrame.from_dict(columns)
35
+
36
def get_data_one_map(address, is_LAT_LONG_only=False):
    """Search OneMap for `address` and return the first hit.

    Args:
        address: Search text (address, building name or postal code).
        is_LAT_LONG_only: When true, return only the coordinates.

    Returns:
        ``None`` when the search finds nothing; otherwise the first result
        dict, or its ``(LATITUDE, LONGITUDE)`` tuple when
        ``is_LAT_LONG_only`` is true. The coordinates are returned exactly
        as the API supplies them.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Let requests build the query string so characters such as '&', '#'
    # or '+' in the search text are encoded instead of corrupting the URL.
    resp = requests.get(
        'https://developers.onemap.sg/commonapi/search',
        params={'searchVal': str(address), 'returnGeom': 'Y', 'getAddrDetails': 'Y'},
        timeout=30,
    )
    resp.raise_for_status()
    data = resp.json()

    if data['found'] == 0:
        return None

    first_hit = data['results'][0]
    if is_LAT_LONG_only:
        return (first_hit['LATITUDE'], first_hit['LONGITUDE'])
    return first_hit
48
+
49
def distance_to_city(address):
    """Return the great-circle distance in km from `address` to the city reference point.

    The reference point is the fixed coordinate
    (1.29293672227779, 103.852585580366).

    Raises:
        ValueError: If the address cannot be geocoded.
    """
    hdb_coordinates = get_data_one_map(address, is_LAT_LONG_only = True)
    if hdb_coordinates is None:
        # Without this guard the None would be handed straight to geopy.
        raise ValueError(f'Address not found: {address!r}')
    city_coordinates = (1.29293672227779, 103.852585580366)
    return geopy.distance.great_circle(city_coordinates, hdb_coordinates).km
52
+
53
def distance_to_nearest_MRT_station(address):
    """Return the great-circle distance in km from `address` to the closest station.

    Station coordinates are read from the ``LAT_LONG`` column of the pickled
    frame ``./df_MRT.pkl``.

    Raises:
        ValueError: If the address cannot be geocoded.
    """
    hdb_coordinates = get_data_one_map(address, is_LAT_LONG_only = True)
    if hdb_coordinates is None:
        # Without this guard the None would be handed straight to geopy.
        raise ValueError(f'Address not found: {address!r}')

    df_MRT = pd.read_pickle('./df_MRT.pkl')
    # Select the coordinate column by name instead of "whatever is last".
    return min(
        geopy.distance.great_circle(hdb_coordinates, station).km
        for station in df_MRT['LAT_LONG'].tolist()
    )
61
+
62
def month_to_quarter(x):
    """Convert a 'YYYY-MM' string into a 'YYYY-nQ' quarter label.

    Example: '2020-05' -> '2020-2Q'.

    Raises:
        ValueError: If the year or month is not an integer, or the month is
            outside 1-12 (previously such months were silently mapped to
            the first or fourth quarter).
        IndexError: If `x` contains no '-' separator.
    """
    parts = x.split('-')
    year = int(parts[0])
    month = int(parts[1])
    if not 1 <= month <= 12:
        raise ValueError(f'month out of range in {x!r}')
    quarter = (month - 1) // 3 + 1  # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
    return f'{year}-{quarter}Q'
74
+
75
# Whole-word replacements applied to the geocoder's road names so that the
# rebuilt address can be joined with the resale records' `block street_name`.
_STREET_ABBREVIATIONS = {
    'AVENUE': 'AVE',
    'CRESCENT': 'CRES',
    'ROAD': 'RD',
    'STREET': 'ST',
    'CENTRAL': 'CTRL',
    'HEIGHTS': 'HTS',
    'TERRACE': 'TER',
    'JALAN': 'JLN',
    'DRIVE': 'DR',
    'PLACE': 'PL',
    'CLOSE': 'CL',
    'PARK': 'PK',
    'GARDENS': 'GDNS',
    'NORTH': 'NTH',
    'SOUTH': 'STH',
    'BUKIT': 'BT',
    'UPPER': 'UPP',
    'COMMONWEALTH': "C'WEALTH",
}

# Exact feature columns, in order, that app.py passes to the saved model.
_MODEL_FEATURE_COLUMNS = [
    'floor_area_sqft',
    'remaining_lease_years',
    'distance_to_nearest_MRT_station',
    'distance_to_city',
    'flat_type_1 ROOM',
    'flat_type_2 ROOM',
    'flat_type_3 ROOM',
    'flat_type_4 ROOM',
    'flat_type_5 ROOM',
    'flat_type_EXECUTIVE',
    'flat_type_MULTI-GENERATION',
    'storey_range_High Floor',
    'storey_range_Low Floor',
    'storey_range_Mid Floor',
]


def _storey_band(storey_range):
    """Collapse a range such as '04 TO 06' into 'Low Floor', 'Mid Floor' or 'High Floor'."""
    try:
        lowest_storey = int(str(storey_range).split()[0])
    except (ValueError, IndexError):
        return storey_range  # leave unrecognised values untouched
    if lowest_storey <= 3:
        return 'Low Floor'
    if lowest_storey <= 9:
        return 'Mid Floor'
    return 'High Floor'


def _geocode_all(queries):
    """Geocode every query and return the hits as a DataFrame with a LAT_LONG tuple column."""
    hits = [hit for hit in map(get_data_one_map, queries) if hit is not None]
    frame = pd.DataFrame.from_dict(hits)
    frame['LAT_LONG'] = pd.Series(list(zip(frame['LATITUDE'], frame['LONGITUDE'])), index=frame.index)
    return frame


def _abbreviate_street_names(addresses):
    """Replace whole words listed in _STREET_ABBREVIATIONS within a Series of addresses."""
    pattern = r'\b(?:' + '|'.join(_STREET_ABBREVIATIONS) + r')\b'
    return addresses.str.replace(pattern, lambda match: _STREET_ABBREVIATIONS[match.group(0)], regex=True)


def get_update(data_year):
    """Retrain the price-per-sqft model on resale records sold in or after `data_year`.

    Steps: download the resale records and the price index, normalise prices
    to the latest index value, geocode every flat address and every station
    listed in ./MRT_LRT_STATION.csv, derive the distance features, then fit
    a StandardScaler + RandomForestRegressor pipeline on an 80/20 split.

    Side effects:
        * writes the geocoded stations to ``df_MRT.pkl``;
        * writes ``model.sav`` and ``model.log`` when the new model's RMSE is
          lower than the one recorded in ``model.log``, or when no saved
          model (or no readable log) exists yet.

    Args:
        data_year: Earliest ``year_sold`` to keep in the training data.

    Returns:
        None.
    """
    import os  # local import: only needed for the saved-model existence check

    ### DATA EXTRACTION AND PREPROCESSING ###
    df_raw_data = get_data_data_gov('https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&limit=1000000')
    df_raw_data['address'] = df_raw_data['block'] + ' ' + df_raw_data['street_name']
    df_raw_data['quarter'] = df_raw_data['month'].apply(month_to_quarter)
    df_raw_data['year_sold'] = df_raw_data['month'].apply(lambda x: int(x.split('-')[0]))
    df_raw_data['month_sold'] = df_raw_data['month'].apply(lambda x: int(x.split('-')[1]))
    df_raw_data['remaining_lease_years'] = df_raw_data['remaining_lease'].apply(lambda x: int(x.split()[0]))
    df_raw_data['floor_area_sqft'] = round((df_raw_data['floor_area_sqm'].astype(float))*10.764)

    df_price_index = get_data_singstat_price_index('https://tablebuilder.singstat.gov.sg/api/table/tabledata/M212161?isTestApi=true')
    df_price_index = df_price_index.rename(columns = {'key':'quarter', 'value': 'index'})
    df_price_index['quarter'] = df_price_index['quarter'].apply(lambda x : x.replace(' ', '-'))

    # .copy() so the column assignments below modify an independent frame
    # rather than a slice of df_raw_data.
    df_selected_year = df_raw_data[df_raw_data['year_sold']>=data_year].copy()

    # Quarters missing from the index fall back to the last quarter listed.
    latest_quarter = df_price_index['quarter'].iloc[-1]
    known_quarter = df_selected_year['quarter'].isin(df_price_index['quarter'])
    df_selected_year['quarter'] = df_selected_year['quarter'].where(known_quarter, latest_quarter)
    df_selected_year = pd.merge(df_selected_year, df_price_index, how='left', on='quarter')

    # convert to float
    df_selected_year['index'] = pd.to_numeric(df_selected_year['index'])
    df_selected_year['resale_price'] = pd.to_numeric(df_selected_year['resale_price'])

    # normalised to latest price index
    latest_index = float(df_price_index['index'].iloc[-1])
    df_selected_year['normalised_resale_price'] = round(df_selected_year['resale_price']*latest_index/df_selected_year['index'],0)
    df_selected_year['price_psf'] = round(df_selected_year['normalised_resale_price']/df_selected_year['floor_area_sqft'])

    df_selected_year['storey_range'] = df_selected_year['storey_range'].apply(_storey_band)

    df_selected_year = df_selected_year.drop(columns=['street_name', 'resale_price', 'remaining_lease', 'lease_commence_date', 'block'])

    # Geocode every distinct flat address.
    df_HDB = _geocode_all(df_selected_year['address'].unique().tolist())

    # Geocode every station and cache the result for the prediction path.
    MRT_list = pd.read_csv('./MRT_LRT_STATION.csv')['Station'].unique().tolist()
    print('Number of MRT stations:', len(MRT_list))
    df_MRT = _geocode_all(MRT_list)
    df_MRT.to_pickle('df_MRT.pkl')

    # get the distance from each address to the nearest station
    # (assigned directly, so no self-merge on LAT_LONG that could duplicate rows)
    MRT_coordinates = df_MRT['LAT_LONG'].tolist()
    df_HDB['distance_to_nearest_MRT_station'] = df_HDB['LAT_LONG'].apply(
        lambda home: min(geopy.distance.great_circle(home, station).km for station in MRT_coordinates))

    # None of these columns is used afterwards; tolerate any that are absent.
    df_HDB = df_HDB.drop(columns=['SEARCHVAL', 'BUILDING', 'ADDRESS' ,'X', 'Y', 'LONGTITUDE'], errors='ignore')

    # City Hall: 1.29293672227779 103.852585580366
    df_HDB['distance_to_city'] = df_HDB['LAT_LONG'].apply(lambda x: geopy.distance.great_circle((1.29293672227779, 103.852585580366), x).km)

    df_HDB['address'] = _abbreviate_street_names(df_HDB['BLK_NO'] + ' ' + df_HDB['ROAD_NAME'])
    # Several queries can resolve to the same block; keep one row per address
    # so the join below cannot multiply the resale records.
    df_HDB = df_HDB.drop_duplicates(subset='address')

    df_clean_data = pd.merge(df_selected_year, df_HDB, on='address', how='left')
    df_clean_data = df_clean_data.dropna(subset=['LAT_LONG'])

    ### FEATURE SELECTION ###
    features = [
        'flat_type',
        'storey_range',
        'floor_area_sqft',
        'remaining_lease_years',
        'distance_to_nearest_MRT_station',
        'distance_to_city',
        'price_psf']

    df = pd.get_dummies(df_clean_data[features])

    ### MODEL TRAINING AND TESTING ###
    # Pin the columns to the layout used at prediction time: a category that
    # does not occur in the selected years still gets an (all-zero) column.
    X = df.drop('price_psf', axis=1).reindex(columns=_MODEL_FEATURE_COLUMNS, fill_value=0)
    y = df['price_psf']

    print('Average price_psf:', y.mean())
    print('Min price_psf:', y.min())
    print('Max price_psf:', y.max())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model_RFR = make_pipeline(StandardScaler(),RandomForestRegressor())
    model_RFR.fit(X_train, y_train) # apply scaling on training data

    # Predict once and reuse for every metric.
    y_pred = model_RFR.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = float(np.sqrt(mse))

    print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', rmse)

    # model.log stores the RMSE of the saved model; lower is better.
    try:
        with open("model.log", "r") as log_file:
            previous_rmse = float(log_file.readline().split()[-1])
    except (FileNotFoundError, IndexError, ValueError):
        previous_rmse = float('inf')

    if rmse < previous_rmse or not os.path.exists('model.sav'):
        with open('model.sav', 'wb') as model_file:
            pickle.dump(model_RFR, model_file)
        with open("model.log", "w") as log_file:
            log_file.write(f'model_score = {rmse}')