Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from matplotlib import pyplot as plt
|
4 |
+
# NOTE: "%matplotlib inline" is an IPython/Jupyter magic, not Python syntax. In a plain
# app.py it is a SyntaxError at import time, so it is kept only as a comment.
# %matplotlib inline
|
5 |
+
import matplotlib
|
6 |
+
matplotlib.rcParams["figure.figsize"] = (20, 10)
|
7 |
+
|
8 |
+
# --- Load the raw listings and take a first look ------------------------------
path = '/content/bengaluru_house_prices.csv'
df = pd.read_csv(path)
df.head()

df.shape

# Number of listings per area type.
df.groupby('area_type')['area_type'].count()

# Drop the columns that are not used further down.
df = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])
df.head()

# Missing values per column; afterwards every row containing one is removed.
df.isna().sum()

df = df.dropna()
df.head()

df.shape
df.isna().sum()

df['size'].unique()

# BHK is the integer that precedes the first space in the 'size' string.
df['BHK'] = df['size'].apply(lambda size_text: int(size_text.split(' ')[0]))

df.head()

df['BHK'].unique()

df['total_sqft'].unique()
|
36 |
+
|
37 |
+
def isfloat(x):
    """Convert an area entry to a float, averaging "low - high" ranges.

    Args:
        x: The raw value, normally a string such as ``"1200"`` or
            ``"2100 - 2600"``. Non-string values are passed to ``float``.

    Returns:
        The parsed number, the midpoint of a two-part range, or ``None``
        when the value cannot be interpreted as a number.
    """
    if isinstance(x, str):
        token = x.split('-')
        if len(token) == 2:
            try:
                return (float(token[0]) + float(token[1])) / 2
            except ValueError:
                # Two tokens but not a numeric range (e.g. "-5", "1e-5", "a - b"):
                # fall through and try the value as a plain number instead.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
|
45 |
+
|
46 |
+
# Quick check of the parser on a "low - high" string.
isfloat('2100 - 2600')

# Make total_sqft numeric; entries isfloat cannot parse come back as None.
df['total_sqft'] = df['total_sqft'].apply(isfloat)
df.head(31)

# 'size' is no longer needed once BHK exists.
df = df.drop(columns=['size'])

df.head(31)

df.dtypes

# price is scaled by 100000 before being divided by the area.
df['price_per_sqft'] = (df['price'] * 100000) / df['total_sqft']
df.head()

len(df.location.unique())

# Strip surrounding whitespace so identical locations group together.
df.location = df.location.apply(str.strip)
loc_stats = df.groupby('location')['location'].count().sort_values(ascending=False)
loc_stats

len(loc_stats[loc_stats <= 10])

# Locations that occur at most 10 times.
loc_stats_ten = loc_stats[loc_stats <= 10]
loc_stats_ten

# `in` on a Series tests its index, which here holds the location names.
df.location = df.location.apply(
    lambda name: 'other' if name in loc_stats_ten else name
)


len(df.location.unique())

df.head(10)

# Listings with less than 300 sq ft per bedroom are previewed, then filtered out.
df[df.total_sqft / df.BHK < 300].head()

df = df[~(df.total_sqft / df.BHK < 300)]

df.price_per_sqft.describe()
|
83 |
+
|
84 |
+
def rem_out(df):
    """Keep, per location, the rows whose price_per_sqft is within one std of the mean.

    For every ``location`` group the mean and (population) standard deviation
    of ``price_per_sqft`` are computed, and only rows with
    ``mean - std < price_per_sqft <= mean + std`` are retained.

    Args:
        df: DataFrame with at least ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame with the retained rows of all locations and a fresh
        0..n-1 index. If there are no groups, an empty frame with the input's
        columns is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        mu = np.mean(subdf.price_per_sqft)
        std = np.std(subdf.price_per_sqft)
        in_band = (subdf.price_per_sqft > (mu - std)) & (subdf.price_per_sqft <= (mu + std))
        kept.append(subdf[in_band])
    if not kept:
        # pd.concat rejects an empty list; keep the schema of the input instead.
        return df.iloc[0:0].reset_index(drop=True)
    # A single concat avoids re-copying the accumulated frame on every iteration.
    return pd.concat(kept, ignore_index=True)
|
92 |
+
|
93 |
+
# Apply the per-location price_per_sqft filter and look at what remains.
df = rem_out(df)

df.shape

df.head()
|
96 |
+
|
97 |
+
def plot_scatter(df, location):
    """Scatter-plot price against total area for the 2 and 3 BHK rows of one location.

    Args:
        df: DataFrame with ``location``, ``BHK``, ``total_sqft`` and ``price`` columns.
        location: Value of the ``location`` column to plot.

    Side effects:
        Sets the global matplotlib ``figure.figsize`` to (15, 10) and draws on
        the current pyplot figure; nothing is returned.
    """
    in_location = df[df.location == location]
    bhk2 = in_location[in_location.BHK == 2]
    bhk3 = in_location[in_location.BHK == 3]
    matplotlib.rcParams['figure.figsize'] = (15, 10)
    plt.scatter(bhk2.total_sqft, bhk2.price, color='red', label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price, color='blue', label='3 BHK', s=50)
    plt.xlabel('Total sq feet area')
    # The y values are the `price` column, not price per square foot.
    plt.ylabel('price')
    plt.legend()
|
106 |
+
|
107 |
+
# 2 vs 3 BHK scatter for the "Hebbal" location.
plot_scatter(df, location="Hebbal")

df.head()
|
110 |
+
|
111 |
+
def remove_outlier(df):
    """Drop rows priced below the mean of the next-smaller BHK in the same location.

    Within each location, a row with ``BHK == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``BHK == n - 1`` rows of that location, provided that smaller group has
    more than 5 rows.

    Args:
        df: DataFrame with ``location``, ``BHK`` and ``price_per_sqft`` columns.

    Returns:
        A copy of ``df`` without the excluded rows; the original index labels
        are kept.
    """
    exclude = []
    for _, location_df in df.groupby('location'):
        # Group once and reuse the result for both the statistics and the filter.
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stat = {
            bhk: {'mean': np.mean(bhk_df.price_per_sqft), 'count': bhk_df.shape[0]}
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stat = bhk_stat.get(bhk - 1)
            if stat is not None and stat['count'] > 5:
                # Collect labels in a list so they keep their own type and no
                # array is re-copied on every append.
                exclude.extend(bhk_df[bhk_df.price_per_sqft < stat['mean']].index)
    return df.drop(index=exclude)
|
128 |
+
|
129 |
+
# Remove the BHK-based price outliers and re-plot the same location.
df = remove_outlier(df)
df.shape

plot_scatter(df, "Hebbal")

# Distribution of price per square foot.
matplotlib.rcParams["figure.figsize"] = (20, 10)
plt.hist(df.price_per_sqft, rwidth=0.8)
plt.xlabel("price per sq feet")
plt.ylabel("count")

df.bath.unique()

# Distribution of bathroom counts.
plt.hist(df.bath, rwidth=0.5)
plt.xlabel("no. of bathrooms")
plt.ylabel("count")

# Preview of rows with more than BHK + 2 bathrooms.
df[df.bath > df.BHK + 2]

# NOTE(review): the preview above uses `>`, but this filter keeps only
# bath < BHK + 2, so rows with bath == BHK + 2 are dropped as well - confirm intent.
df = df[df.bath < df.BHK + 2]
df.shape

# price_per_sqft was only needed for outlier detection.
df = df.drop(columns=['price_per_sqft'])
df.head(10)

# One-hot encode the location; the 'other' column is left out of the features.
dummies = pd.get_dummies(df.location)
dummies.head()

df = pd.concat([df, dummies.drop(columns='other')], axis=1)
df.head()

df = df.drop(columns='location')
df.head()

df.shape

# Feature matrix and target.
x = df.drop(columns='price')
x.head()

y = df.price
y.head()
|
169 |
+
|
170 |
+
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=10
)

# Fit a plain linear regression and score it on the held-out rows.
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

# Five random 80/20 splits for cross-validation.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)

cross_val_score(LinearRegression(), x, y, cv=cv)
|
184 |
+
|
185 |
+
from sklearn.model_selection import GridSearchCV
|
186 |
+
|
187 |
+
from sklearn.linear_model import Lasso
|
188 |
+
from sklearn.tree import DecisionTreeRegressor
|
189 |
+
|
190 |
+
def find_best_model(x, y):
    """Grid-search three regressors on (x, y) and tabulate the best result of each.

    Linear regression, Lasso and a decision-tree regressor are each tuned with
    ``GridSearchCV`` over a fixed parameter grid, using a 5-split
    ``ShuffleSplit`` (20% test size, ``random_state=10``).

    Args:
        x: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        A DataFrame with one row per algorithm and the columns
        ``model``, ``best_score`` and ``best_params``.
    """
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)

    # (name, estimator, hyper-parameter grid) for every candidate algorithm.
    candidates = [
        (
            'linear_reg',
            LinearRegression(),
            {
                'fit_intercept': [True, False],
                'copy_X': [True, False],
                'n_jobs': [None, -1],
                'positive': [True, False],
            },
        ),
        (
            'lasso',
            Lasso(),
            {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic'],
            },
        ),
        (
            'dec_tree',
            DecisionTreeRegressor(),
            {
                'criterion': ['friedman_mse', 'squared_error', 'poisson', 'absolute_error'],
                'splitter': ['best', 'random'],
            },
        ),
    ]

    report_columns = ['model', 'best_score', 'best_params']
    rows = []
    for name, estimator, grid in candidates:
        search = GridSearchCV(estimator, grid, cv=splitter, return_train_score=False)
        search.fit(x, y)
        rows.append(
            dict(zip(report_columns, (name, search.best_score_, search.best_params_)))
        )
    return pd.DataFrame(rows, columns=report_columns)
|
227 |
+
|
228 |
+
# Compare the candidate regressors on the full feature matrix and target.
find_best_model(x, y)
|
229 |
+
|
230 |
+
def predict_price_func(location, sqft, bath, bhk):
    """Predict a price with the fitted ``lr_clf`` model for one listing.

    The feature vector has one entry per column of the module-level ``x``:
    positions 0, 1 and 2 receive ``sqft``, ``bath`` and ``bhk``, and the
    column whose name equals ``location`` (if any) is set to 1.

    Args:
        location: Location name; matched against the column names of ``x``.
        sqft: Total area, written to feature 0.
        bath: Number of bathrooms, written to feature 1.
        bhk: Number of bedrooms, written to feature 2.

    Returns:
        The first (only) element of ``lr_clf.predict`` for the built vector.
    """
    xdash = np.zeros(len(x.columns))
    xdash[0] = sqft
    xdash[1] = bath
    xdash[2] = bhk

    # np.where returns an empty array when no column matches; indexing it with
    # [0] unconditionally raised IndexError for any location without its own
    # column. Such locations now keep all location features at 0.
    matches = np.where(x.columns == location)[0]
    if matches.size > 0:
        xdash[matches[0]] = 1

    return lr_clf.predict([xdash])[0]
|
242 |
+
|
243 |
+
df.head()

# Show the feature columns the model was trained on.
print(x.columns)

# A few spot checks of the prediction helper.
predict_price_func('1st Phase JP Nagar', sqft=1200, bath=2, bhk=2)

predict_price_func('Indira Nagar', sqft=1200, bath=3, bhk=3)

predict_price_func('Indira Nagar', sqft=1200, bath=1, bhk=3)

predict_price_func('Indira Nagar', sqft=1200, bath=3, bhk=4)
|
254 |
+
|
255 |
+
# NOTE: "!pip install gradio" is a notebook shell escape and a SyntaxError in a .py
# file; gradio has to be declared as a dependency of the app instead
# (e.g. in requirements.txt).
import gradio as gr

from gradio.components import Textbox, Number

# The inputs are passed positionally to
# predict_price_func(location, sqft, bath, bhk), so their order must match that
# signature: bathrooms come before bedrooms.
# The `gr.inputs` namespace no longer exists in current Gradio releases, so the
# component classes imported above are used directly.
# NOTE(review): theme="huggingface" was dropped because it is not a built-in theme
# in current Gradio - confirm against the installed version.
interface = gr.Interface(
    fn=predict_price_func,
    inputs=[
        Textbox(label="Location"),
        Number(label="Total area (sq ft)"),
        Number(label="Bathrooms"),
        Number(label="Bedrooms (BHK)"),
    ],
    outputs="text",
)

interface.launch()
|