Aashiue committed on
Commit
56ead55
1 Parent(s): a934882

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +272 -0
app.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# NOTE: the notebook magic `%matplotlib inline` was removed here. It is
# IPython-only syntax and makes a plain Python module fail to parse.
import matplotlib

# Default size for every figure created below.
matplotlib.rcParams["figure.figsize"] = (20, 10)
7
+
8
+ path = '/content/bengaluru_house_prices.csv'
9
+ df = pd.read_csv(path)
10
+ df.head()
11
+
12
+ df.shape
13
+
14
+ df.groupby('area_type')['area_type'].agg('count')
15
+
16
+ df = df.drop(['area_type','society','balcony','availability'], axis = 'columns')
17
+ df.head()
18
+
19
+ df.isnull().sum()
20
+
21
+ df=df.dropna()
22
+ df.head()
23
+
24
+ df.shape
25
+ df.isnull().sum()
26
+
27
+ df['size'].unique()
28
+
29
+ df['BHK'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
30
+
31
+ df.head()
32
+
33
+ df['BHK'].unique()
34
+
35
+ df['total_sqft'].unique()
36
+
37
def isfloat(x):
    """Convert a ``total_sqft`` entry to a float, or ``None`` if unparseable.

    A value of the form ``"low - high"`` (exactly one ``-`` separating two
    numbers) is converted to the midpoint of the two numbers. Anything else
    is passed to ``float``.

    Args:
        x: The raw cell value, normally a string such as ``"1200"`` or
            ``"2100 - 2600"``.

    Returns:
        The parsed number as a float, or ``None`` when the value is neither a
        number nor a numeric range.
    """
    try:
        token = x.split('-')
        if len(token) == 2:
            return (float(token[0]) + float(token[1])) / 2
    except (AttributeError, ValueError):
        # Not a string, or not a numeric range (e.g. "1e-5", "-5", or text
        # containing a dash); fall through to the plain conversion below.
        pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
45
+
46
+ isfloat('2100 - 2600')
47
+
48
+ df['total_sqft'] = df['total_sqft'].apply(isfloat)
49
+ df.head(31)
50
+
51
+ df=df.drop(['size'], axis = 'columns')
52
+
53
+ df.head(31)
54
+
55
+ df.dtypes
56
+
57
+ df['price_per_sqft'] = df['price']*100000/df['total_sqft']
58
+ df.head()
59
+
60
+ len(df.location.unique())
61
+
62
+ df.location = df.location.apply(lambda x: x.strip())
63
+ loc_stats = df.groupby('location')['location'].agg('count').sort_values(ascending = False)
64
+ loc_stats
65
+
66
+ len(loc_stats[loc_stats <= 10])
67
+
68
+ loc_stats_ten = loc_stats[loc_stats<=10]
69
+ loc_stats_ten
70
+
71
+ df.location = df.location.apply(lambda x: 'other' if x in loc_stats_ten else x)
72
+
73
+
74
+ len(df.location.unique());
75
+
76
+ df.head(10)
77
+
78
+ df[df.total_sqft/df.BHK < 300].head()
79
+
80
+ df = df[~(df.total_sqft/df.BHK < 300)]
81
+
82
+ df.price_per_sqft.describe()
83
+
84
def rem_out(df):
    """Drop rows whose price per sqft is far from their location's mean.

    For every ``location`` group, only rows with ``price_per_sqft`` in the
    half-open interval ``(mean - std, mean + std]`` are kept, where ``std``
    is the population standard deviation (``ddof=0``) of that group.

    Args:
        df: DataFrame with at least ``location`` and ``price_per_sqft``
            columns.

    Returns:
        A new DataFrame holding the surviving rows of every location, in
        group order, with a fresh 0..n-1 index. When ``df`` has no groups an
        empty frame with the same columns is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        mu = pps.mean()
        std = pps.std(ddof=0)
        # NOTE: the lower bound is exclusive, so a location with a single
        # row (std == 0) contributes nothing.
        kept.append(subdf[(pps > (mu - std)) & (pps <= (mu + std))])
    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
92
+
93
+ df = rem_out(df);
94
+ df.shape
95
+ df.head()
96
+
97
def plot_scatter(df, location):
    """Scatter-plot price against area for 2 and 3 BHK homes in one location.

    Args:
        df: DataFrame with ``location``, ``BHK``, ``total_sqft`` and
            ``price`` columns.
        location: Value of the ``location`` column to plot.

    Side effects:
        Sets the global ``figure.figsize`` rcParam to ``(15, 10)`` and draws
        onto the current pyplot figure.
    """
    in_location = df.location == location
    bhk2 = df[in_location & (df.BHK == 2)]
    bhk3 = df[in_location & (df.BHK == 3)]
    matplotlib.rcParams['figure.figsize'] = (15, 10)
    plt.scatter(bhk2.total_sqft, bhk2.price, color='red', label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price, color='blue', label='3 BHK', s=50)
    plt.xlabel('Total sq feet area')
    # The y values are the `price` column, not price per square foot.
    plt.ylabel('price')
    plt.legend()
106
+
107
+ plot_scatter(df, "Hebbal")
108
+
109
+ df.head()
110
+
111
def remove_outlier(df):
    """Drop homes that are cheaper per sqft than the next-smaller BHK nearby.

    Within each ``location``, a row with N BHK is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the (N-1) BHK
    rows of the same location, provided that (N-1) BHK group has more than
    5 rows.

    Args:
        df: DataFrame with ``location``, ``BHK`` and ``price_per_sqft``
            columns.

    Returns:
        A new DataFrame without the excluded rows; the original index labels
        of the surviving rows are preserved.
    """
    exclude = []
    for _, location_df in df.groupby('location'):
        # Materialise the per-BHK groups once; they are needed for both the
        # statistics pass and the filtering pass.
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stat = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stat = bhk_stat.get(bhk - 1)
            if stat and stat['count'] > 5:
                # Collect index labels as-is so they keep their own dtype.
                exclude.extend(
                    bhk_df.index[bhk_df.price_per_sqft < stat['mean']]
                )
    return df.drop(exclude, axis='index')
128
+
129
+ df = remove_outlier(df)
130
+ df.shape
131
+
132
+ plot_scatter(df, "Hebbal")
133
+
134
+ matplotlib.rcParams["figure.figsize"] = (20,10)
135
+ plt.hist(df.price_per_sqft, rwidth=0.8)
136
+ plt.xlabel("price per sq feet")
137
+ plt.ylabel("count")
138
+
139
+ df.bath.unique()
140
+
141
+ plt.hist(df.bath, rwidth = 0.5)
142
+ plt.xlabel('no. of bathrooms')
143
+ plt.ylabel('count')
144
+
145
+ df[df.bath > df.BHK+2]
146
+
147
+ df = df[df.bath < df.BHK+2]
148
+ df.shape
149
+
150
+ df = df.drop(['price_per_sqft'], axis = 'columns')
151
+ df.head(10)
152
+
153
+ dummies = pd.get_dummies(df.location)
154
+ dummies.head()
155
+
156
+ df = pd.concat([df, dummies.drop('other', axis = 'columns')], axis = 'columns')
157
+ df.head()
158
+
159
+ df = df.drop('location', axis = 'columns')
160
+ df.head()
161
+
162
+ df.shape
163
+
164
+ x = df.drop('price', axis = 'columns')
165
+ x.head()
166
+
167
+ y = df.price
168
+ y.head()
169
+
170
+ from sklearn.model_selection import train_test_split
171
+ X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 10)
172
+
173
+ from sklearn.linear_model import LinearRegression
174
+ lr_clf = LinearRegression()
175
+ lr_clf.fit(X_train, y_train)
176
+ lr_clf.score(X_test, y_test)
177
+
178
+ from sklearn.model_selection import ShuffleSplit
179
+ from sklearn.model_selection import cross_val_score
180
+
181
+ cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 10)
182
+
183
+ cross_val_score(LinearRegression(), x, y, cv = cv)
184
+
185
+ from sklearn.model_selection import GridSearchCV
186
+
187
+ from sklearn.linear_model import Lasso
188
+ from sklearn.tree import DecisionTreeRegressor
189
+
190
def find_best_model(x, y):
    """Grid-search three regressors and tabulate the best result of each.

    Linear regression, lasso and a decision tree are each tuned with
    ``GridSearchCV`` over a small hyper-parameter grid, using five shuffled
    80/20 splits (``random_state=10``).

    Args:
        x: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        A DataFrame with one row per model and the columns ``model``,
        ``best_score`` and ``best_params``.
    """
    linear_grid = {
        'fit_intercept': [True, False],
        'copy_X': [True, False],
        'n_jobs': [None, -1],
        'positive': [True, False],
    }
    lasso_grid = {
        'alpha': [1, 2],
        'selection': ['random', 'cyclic'],
    }
    tree_grid = {
        'criterion': ['friedman_mse', 'squared_error', 'poisson', 'absolute_error'],
        'splitter': ['best', 'random'],
    }
    # NOTE(review): no random_state is passed to Lasso or
    # DecisionTreeRegressor, so scores may vary between runs - confirm
    # whether that is intended.
    candidates = [
        ('linear_reg', LinearRegression(), linear_grid),
        ('lasso', Lasso(), lasso_grid),
        ('dec_tree', DecisionTreeRegressor(), tree_grid),
    ]

    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
    results = []
    for label, estimator, grid in candidates:
        search = GridSearchCV(estimator, grid, cv=splitter, return_train_score=False)
        search.fit(x, y)
        results.append({
            'model': label,
            'best_score': search.best_score_,
            'best_params': search.best_params_,
        })
    return pd.DataFrame(results, columns=['model', 'best_score', 'best_params'])
227
+
228
+ find_best_model(x,y)
229
+
230
def predict_price_func(location, sqft, bath, bhk):
    """Predict a price with the fitted ``lr_clf`` model.

    Builds a single feature row laid out like the module-level feature frame
    ``x``: position 0 is the area, 1 the bathroom count, 2 the BHK count, and
    the remaining positions are one-hot location columns.

    Args:
        location: Location name. When it does not match a location column of
            ``x`` (for instance the dropped ``other`` category), every
            location column is left at 0 instead of raising.
        sqft: Total area.
        bath: Number of bathrooms.
        bhk: Number of bedrooms (BHK).

    Returns:
        The model's prediction for the assembled row.
    """
    n_numeric = 3  # leading non-location features: sqft, bath, bhk

    xdash = np.zeros(len(x.columns))
    xdash[0] = sqft
    xdash[1] = bath
    xdash[2] = bhk

    # np.where returns an empty array for an unknown location; indexing it
    # with [0] unconditionally would raise IndexError.
    matches = np.where(x.columns == location)[0]
    # Ignore matches on the numeric columns so a location string that happens
    # to equal a feature name cannot overwrite that feature.
    if matches.size > 0 and matches[0] >= n_numeric:
        xdash[matches[0]] = 1

    return lr_clf.predict([xdash])[0]
242
+
243
+ df.head()
244
+
245
+ print(x.columns)
246
+
247
+ predict_price_func('1st Phase JP Nagar', 1200, 2, 2)
248
+
249
+ predict_price_func('Indira Nagar', 1200, 3, 3)
250
+
251
+ predict_price_func('Indira Nagar', 1200, 1, 3)
252
+
253
+ predict_price_func('Indira Nagar', 1200, 3, 4)
254
+
255
# NOTE: the notebook shell magic `!pip install gradio` was removed here. It is
# not valid Python in a module; install gradio through the environment
# (e.g. a requirements file) instead.
import gradio as gr

from gradio.components import Textbox, Number

# Inputs are listed in the positional order of
# predict_price_func(location, sqft, bath, bhk), so each field's label matches
# the argument it feeds.
interface = gr.Interface(
    fn=predict_price_func,
    inputs=[
        Textbox(label="Location"),
        Number(label="Total area (sq ft)"),
        Number(label="Bathrooms"),
        Number(label="Bedrooms (BHK)"),
    ],
    outputs="text",
    theme="huggingface"
)

interface.launch()