Spaces:
Runtime error
Runtime error
import pandas as pd | |
import numpy as np | |
from matplotlib import pyplot as plt | |
%matplotlib inline | |
import matplotlib | |
matplotlib.rcParams["figure.figsize"] = (20, 10) | |
# Load the listings CSV into a DataFrame and take a first look at it.
path = '/content/bengaluru_house_prices.csv'
df = pd.read_csv(path)
df.head()
df.shape

# Number of listings recorded for each area type.
df.groupby('area_type')['area_type'].count()

# These four columns are not referenced by anything further down.
df = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])
df.head()

# Count the missing values per column, then discard every incomplete row.
df.isnull().sum()
df = df.dropna()
df.head()
df.shape
df.isnull().sum()

# BHK is the integer that precedes the first space of the `size` text.
df['size'].unique()
df['BHK'] = df['size'].map(lambda size_text: int(size_text.split(' ')[0]))
df.head()
df['BHK'].unique()
df['total_sqft'].unique()
def isfloat(x):
    """Convert a ``total_sqft`` entry to a float, or return ``None``.

    Two formats are understood: a plain number (``'1200'`` or ``1200``) and
    a range written as ``'low - high'``, which is replaced by the midpoint
    of its two bounds.

    Args:
        x: Raw cell value, normally a string.

    Returns:
        The parsed value as a float, or ``None`` when ``x`` is neither a
        number nor a two-part numeric range.
    """
    # Try the plain number first so that values which contain a '-' but are
    # still valid floats ('-5', '1e-3') are not mistaken for a range.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    if not isinstance(x, str):
        return None

    bounds = x.split('-')
    if len(bounds) != 2:
        return None
    try:
        return (float(bounds[0]) + float(bounds[1])) / 2
    except ValueError:
        # Exactly one '-' but a non-numeric side, e.g. '1000Sq - 2000'.
        return None
# Quick check of the parser on a range-style value.
isfloat('2100 - 2600')

# Parse every total_sqft entry; the raw `size` text is no longer needed.
df['total_sqft'] = df['total_sqft'].map(isfloat)
df.head(31)
df = df.drop(columns=['size'])
df.head(31)
df.dtypes

# Price per square foot: `price` is scaled by 100000 before dividing by area.
df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']
df.head()

# Trim surrounding whitespace so identical location names group together.
len(df.location.unique())
df.location = df.location.map(lambda name: name.strip())

# Listings per location, most frequent first.
loc_stats = df.groupby('location')['location'].count().sort_values(ascending=False)
loc_stats
len(loc_stats[loc_stats <= 10])

# Locations with at most 10 listings are merged into a single 'other' bucket.
loc_stats_ten = loc_stats[loc_stats <= 10]
loc_stats_ten
df.location = df.location.map(
    lambda name: 'other' if name in loc_stats_ten.index else name
)
len(df.location.unique());
df.head(10)

# Remove rows with under 300 sq ft per BHK.  The mask is negated, so rows
# whose ratio is NaN (comparison is False) are kept at this step.
df[df.total_sqft / df.BHK < 300].head()
df = df[~(df.total_sqft / df.BHK < 300)]
df.price_per_sqft.describe()
def rem_out(df):
    """Keep, per location, only the listings priced near the location mean.

    For every ``location`` group the mean and standard deviation (``np.std``)
    of ``price_per_sqft`` are computed, and a row is kept when its
    ``price_per_sqft`` is strictly above ``mean - std`` and at most
    ``mean + std``.  Because the lower bound is strict, a group whose prices
    are all identical (std of 0) contributes no rows.

    Args:
        df: DataFrame with ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame holding the kept rows of all locations, in group
        order, with a fresh 0..n-1 index.  When nothing is kept the result is
        empty but still has the columns of ``df``.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        mu = np.mean(subdf.price_per_sqft)
        std = np.std(subdf.price_per_sqft)
        within = (subdf.price_per_sqft > (mu - std)) & (subdf.price_per_sqft <= (mu + std))
        # Skip groups with no survivors so no empty frame reaches concat.
        if within.any():
            kept.append(subdf[within])

    if not kept:
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of re-copying the accumulated frame per group.
    return pd.concat(kept, ignore_index=True)
# Apply the per-location price filter and inspect what is left.
df = rem_out(df)
df.shape
df.head()
def plot_scatter(df, location):
    """Scatter total area against price for 2 and 3 BHK homes in a location.

    Args:
        df: DataFrame with ``location``, ``BHK``, ``total_sqft`` and
            ``price`` columns.
        location: Value of ``location`` whose listings are plotted.

    Draws on the current pyplot figure; nothing is returned.
    """
    in_location = df.location == location
    bhk2 = df[in_location & (df.BHK == 2)]
    bhk3 = df[in_location & (df.BHK == 3)]
    # This is a global matplotlib setting, not a per-figure one.
    matplotlib.rcParams['figure.figsize'] = (15, 10)
    plt.scatter(bhk2.total_sqft, bhk2.price, color='red', label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price, color='blue', label='3 BHK', s=50)
    plt.xlabel('Total sq feet area')
    # The y values are `price` itself, not a per-square-foot figure.
    plt.ylabel('Price')
    plt.legend()
# 2 vs 3 BHK prices in Hebbal before the BHK-based outlier removal.
plot_scatter(df=df, location="Hebbal")
df.head()
def remove_outlier(df):
    """Drop homes priced below the mean of the next-smaller BHK nearby.

    Within each ``location``, a row with ``BHK == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``BHK == n - 1`` rows of the same location, provided that smaller group
    has more than 5 rows.

    Args:
        df: DataFrame with ``location``, ``BHK`` and ``price_per_sqft``
            columns.

    Returns:
        A new DataFrame without the flagged rows; the index is preserved.
    """
    # Labels are gathered in a plain list so they keep their own type; a
    # float ndarray would coerce them.
    drop_labels = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('BHK'))
        bhk_stat = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stat = bhk_stat.get(bhk - 1)
            # Only compare against a smaller-BHK group with more than 5 rows.
            if stat and stat['count'] > 5:
                cheaper = bhk_df.price_per_sqft < stat['mean']
                drop_labels.extend(bhk_df.index[cheaper])
    return df.drop(drop_labels, axis='index')
# Apply the BHK-based outlier removal and re-plot the same location.
df = remove_outlier(df)
df.shape
plot_scatter(df, "Hebbal")

# Distribution of price per square foot.
matplotlib.rcParams["figure.figsize"] = (20, 10)
plt.hist(df.price_per_sqft, rwidth=0.8)
plt.xlabel("price per sq feet")
plt.ylabel("count")

# Distribution of bathroom counts.
df.bath.unique()
plt.hist(df.bath, rwidth=0.5)
plt.xlabel('no. of bathrooms')
plt.ylabel('count')

# NOTE(review): the preview below uses `>` while the filter uses `<`, so rows
# with bath == BHK + 2 are removed although the preview does not list them.
df[df.bath > df.BHK + 2]
df = df[df.bath < df.BHK + 2]
df.shape

# price_per_sqft was only needed for outlier detection.
df = df.drop(columns=['price_per_sqft'])
df.head(10)

# One-hot encode location; the 'other' column is left out of the encoding.
dummies = pd.get_dummies(df.location)
dummies.head()
df = pd.concat([df, dummies.drop(columns='other')], axis=1)
df.head()
df = df.drop(columns='location')
df.head()
df.shape

# Features are every column except the target, `price`.
x = df.drop(columns='price')
x.head()
y = df['price']
y.head()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Hold out 20% of the rows, fit a linear model and score it on the hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=10
)
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

# Five random 80/20 splits for a cross-validated score of the same model.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
cross_val_score(LinearRegression(), x, y, cv=cv)
from sklearn.model_selection import GridSearchCV | |
from sklearn.linear_model import Lasso | |
from sklearn.tree import DecisionTreeRegressor | |
def find_best_model(x, y):
    """Grid-search three regressors and tabulate each one's best result.

    Linear regression, lasso and a decision tree are each tuned with
    ``GridSearchCV`` over a small parameter grid, using five shuffled 80/20
    splits.

    Args:
        x: Feature matrix.
        y: Target values.

    Returns:
        A DataFrame with one row per algorithm and the columns ``model``,
        ``best_score`` and ``best_params``.
    """
    linear_grid = {
        'fit_intercept': [True, False],
        'copy_X': [True, False],
        'n_jobs': [None, -1],
        'positive': [True, False],
    }
    lasso_grid = {
        'alpha': [1, 2],
        'selection': ['random', 'cyclic'],
    }
    tree_grid = {
        'criterion': ['friedman_mse', 'squared_error', 'poisson', 'absolute_error'],
        'splitter': ['best', 'random'],
    }
    candidates = {
        'linear_reg': (LinearRegression(), linear_grid),
        'lasso': (Lasso(), lasso_grid),
        'dec_tree': (DecisionTreeRegressor(), tree_grid),
    }
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)

    def best_of(name, estimator, grid):
        # Run one grid search and summarise its winner as a table row.
        search = GridSearchCV(estimator, grid, cv=splitter, return_train_score=False)
        search.fit(x, y)
        return {
            'model': name,
            'best_score': search.best_score_,
            'best_params': search.best_params_,
        }

    rows = [
        best_of(name, estimator, grid)
        for name, (estimator, grid) in candidates.items()
    ]
    return pd.DataFrame(rows, columns=['model', 'best_score', 'best_params'])
# Compare the candidate regressors on the full feature matrix and target.
find_best_model(x=x, y=y)
def predict_price_func(location, sqft, bath, bhk):
    """Predict the price of one home with the fitted linear model.

    Builds a single feature row laid out like the columns of ``x``: the
    first three positions hold ``sqft``, ``bath`` and ``bhk``, the remaining
    ones are the one-hot location columns.

    Args:
        location: Location name.  A name with no matching one-hot column
            leaves every location column at 0 instead of raising.
        sqft: Total area in square feet.
        bath: Number of bathrooms.
        bhk: Number of bedrooms (BHK).

    Returns:
        The prediction of ``lr_clf`` for that row.
    """
    n_numeric = 3  # leading columns of x: total_sqft, bath, BHK
    xdash = np.zeros(len(x.columns))
    xdash[0] = sqft
    xdash[1] = bath
    xdash[2] = bhk

    # Search only the one-hot part so a location typed as e.g. 'bath' cannot
    # overwrite a numeric feature.
    matches = np.where(x.columns[n_numeric:] == location)[0]
    if matches.size > 0:
        xdash[matches[0] + n_numeric] = 1

    # The model was fitted on a DataFrame, so predict with the same column
    # names rather than a bare list.
    row = pd.DataFrame([xdash], columns=x.columns)
    return lr_clf.predict(row)[0]
# Show the feature layout, then try the predictor on a few sample homes.
df.head()
print(x.columns)
predict_price_func(location='1st Phase JP Nagar', sqft=1200, bath=2, bhk=2)
predict_price_func(location='Indira Nagar', sqft=1200, bath=3, bhk=3)
predict_price_func(location='Indira Nagar', sqft=1200, bath=1, bhk=3)
predict_price_func(location='Indira Nagar', sqft=1200, bath=3, bhk=4)
!pip install gradio | |
import gradio as gr | |
from gradio.components import Textbox, Number | |
# Web form for the predictor.  Gradio passes the inputs positionally, so the
# components are listed in the order of predict_price_func's parameters:
# location, sqft, bath, bhk.  The components come from gradio.components
# because the gr.inputs namespace no longer exists in current Gradio.
interface = gr.Interface(
    fn=predict_price_func,
    inputs=[
        Textbox(label="Location"),
        Number(label="Total area (sq ft)"),
        Number(label="Bathrooms"),
        Number(label="Bedrooms (BHK)"),
    ],
    outputs="text",
    theme="huggingface"
)
interface.launch()