"""Bengaluru house-price regression: cleaning, outlier removal, model
selection and a small Gradio demo.

This is a notebook-style script. Bare expressions such as ``df.head()`` are
kept from the original notebook cells; they only display something when run
cell by cell and are no-ops when the file is executed as a plain script.
"""
import subprocess
import sys

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

matplotlib.rcParams["figure.figsize"] = (20, 10)

# --- Load the data and drop the columns that are not used as features ------
path = '/content/bengaluru_house_prices.csv'
df = pd.read_csv(path)
df.head()
df.shape
df.groupby('area_type')['area_type'].agg('count')

df = df.drop(['area_type', 'society', 'balcony', 'availability'], axis='columns')
df.head()
df.isnull().sum()
df = df.dropna()
df.head()
df.shape
df.isnull().sum()

# --- Derive the bedroom count from the leading number of the `size` text ---
df['size'].unique()
df['BHK'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
df.head()
df['BHK'].unique()
df['total_sqft'].unique()


def isfloat(x):
    """Convert a ``total_sqft`` entry to a float.

    Args:
        x: Either a plain number (``'1200'``) or a ``'low - high'`` range.

    Returns:
        The number as a float, the midpoint of the range, or ``None`` when
        the value cannot be interpreted as either.
    """
    parts = str(x).split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. '-5' or '1e-3'); try a plain float.
            pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None


isfloat('2100 - 2600')
df['total_sqft'] = df['total_sqft'].apply(isfloat)
df.head(31)
df = df.drop(['size'], axis='columns')
df.head(31)
df.dtypes

# `price` is multiplied by 100000 before dividing by the area.
df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']
df.head()

# --- Collapse locations with 10 or fewer listings into a single 'other' ----
len(df.location.unique())
df.location = df.location.apply(lambda x: x.strip())
loc_stats = df.groupby('location')['location'].agg('count').sort_values(ascending=False)
loc_stats
len(loc_stats[loc_stats <= 10])
loc_stats_ten = loc_stats[loc_stats <= 10]
loc_stats_ten
# `in` on a Series tests its index, which here holds the location names.
df.location = df.location.apply(lambda x: 'other' if x in loc_stats_ten else x)
len(df.location.unique())
df.head(10)

# --- Drop rows with less than 300 sqft per bedroom -------------------------
df[df.total_sqft / df.BHK < 300].head()
df = df[~(df.total_sqft / df.BHK < 300)]
df.price_per_sqft.describe()


def rem_out(df):
    """Keep, per location, rows within one std of the mean price_per_sqft.

    A row is kept when ``mean - std < price_per_sqft <= mean + std``, with
    mean and std computed over the rows of the same location. Rows whose
    ``price_per_sqft`` is NaN fail the comparison and are dropped as well.

    Args:
        df: Frame with ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new frame with the surviving rows and a fresh 0..n-1 index.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        mu = np.mean(subdf.price_per_sqft)
        std = np.std(subdf.price_per_sqft)
        in_band = (subdf.price_per_sqft > (mu - std)) & (subdf.price_per_sqft <= (mu + std))
        kept.append(subdf[in_band])
    if not kept:
        return df.iloc[0:0].copy()
    # One concat at the end instead of re-copying the result on every group.
    return pd.concat(kept, ignore_index=True)


df = rem_out(df)
df.shape
df.head()


def plot_scatter(df, location):
    """Scatter price against area for the 2 and 3 BHK rows of one location.

    Args:
        df: Frame with ``location``, ``BHK``, ``total_sqft`` and ``price``.
        location: Location name to plot.
    """
    bhk2 = df[(df.location == location) & (df.BHK == 2)]
    bhk3 = df[(df.location == location) & (df.BHK == 3)]
    matplotlib.rcParams['figure.figsize'] = (15, 10)
    # Start a fresh figure so successive plots do not overlay each other
    # when the file runs as a single script.
    plt.figure()
    plt.scatter(bhk2.total_sqft, bhk2.price, color='red', label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price, color='blue', label='3 BHK', s=50)
    plt.xlabel('Total sq feet area')
    # The y axis shows `price`, not price per square foot.
    plt.ylabel('Price')
    plt.legend()


plot_scatter(df, "Hebbal")
df.head()


def remove_outlier(df):
    """Drop rows priced below the mean of the next-smaller BHK group.

    Within each location, a row with N BHK is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    (N-1) BHK rows of that location, provided that (N-1) group has more
    than 5 rows.

    Args:
        df: Frame with ``location``, ``BHK`` and ``price_per_sqft`` columns.

    Returns:
        A new frame without the excluded rows.
    """
    exclude = []
    for _, location_df in df.groupby('location'):
        bhk_stat = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in location_df.groupby('BHK')
        }
        for bhk, bhk_df in location_df.groupby('BHK'):
            stat = bhk_stat.get(bhk - 1)
            if stat is not None and stat['count'] > 5:
                exclude.extend(bhk_df.index[bhk_df.price_per_sqft < stat['mean']])
    return df.drop(exclude, axis='index')


df = remove_outlier(df)
df.shape
plot_scatter(df, "Hebbal")

matplotlib.rcParams["figure.figsize"] = (20, 10)
plt.figure()
plt.hist(df.price_per_sqft, rwidth=0.8)
plt.xlabel("price per sq feet")
plt.ylabel("count")

df.bath.unique()
plt.figure()
plt.hist(df.bath, rwidth=0.5)
plt.xlabel('no. of bathrooms')
plt.ylabel('count')

# --- Keep only rows with fewer than BHK + 2 bathrooms ----------------------
df[df.bath > df.BHK + 2]
df = df[df.bath < df.BHK + 2]
df.shape
df = df.drop(['price_per_sqft'], axis='columns')
df.head(10)

# --- One-hot encode the location; 'other' is the all-zero baseline ---------
dummies = pd.get_dummies(df.location)
dummies.head()
df = pd.concat([df, dummies.drop('other', axis='columns')], axis='columns')
df.head()
df = df.drop('location', axis='columns')
df.head()
df.shape

x = df.drop('price', axis='columns')
x.head()
y = df.price
y.head()

# --- Baseline linear model -------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
cross_val_score(LinearRegression(), x, y, cv=cv)


def find_best_model(x, y):
    """Grid-search three regressors and report the best score of each.

    Runs ``GridSearchCV`` over LinearRegression, Lasso and
    DecisionTreeRegressor using a 5-split ``ShuffleSplit`` (20% test size,
    ``random_state=10``).

    Args:
        x: Feature matrix.
        y: Target values.

    Returns:
        A DataFrame with columns ``model``, ``best_score`` and
        ``best_params``, one row per algorithm.
    """
    algos = {
        'linear_reg': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False],
                'copy_X': [True, False],
                'n_jobs': [None, -1],
                'positive': [True, False],
            },
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic'],
            },
        },
        'dec_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['friedman_mse', 'squared_error', 'poisson', 'absolute_error'],
                'splitter': ['best', 'random'],
            },
        },
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(x, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_,
        })
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])


find_best_model(x, y)


def predict_price_func(location, sqft, bath, bhk):
    """Predict the price of a home with the fitted linear model ``lr_clf``.

    Args:
        location: Location name. A name that is not one of the one-hot
            columns of ``x`` is encoded as all zeros, which is the same
            encoding the training data uses for ``'other'``.
        sqft: Total area, written to the ``total_sqft`` feature.
        bath: Number of bathrooms, written to the ``bath`` feature.
        bhk: Number of bedrooms, written to the ``BHK`` feature.

    Returns:
        The model's prediction for the single input row.
    """
    columns = x.columns
    xdash = np.zeros(len(columns))
    # Look features up by name rather than assuming they sit at 0, 1, 2.
    xdash[columns.get_loc('total_sqft')] = sqft
    xdash[columns.get_loc('bath')] = bath
    xdash[columns.get_loc('BHK')] = bhk

    # np.where yields an empty array for an unknown location; indexing it
    # with [0] unconditionally would raise IndexError.
    matches = np.where(columns == location)[0]
    if matches.size:
        xdash[matches[0]] = 1

    # Pass a frame carrying the fitted column names so the estimator
    # receives the same feature names it was trained on.
    return lr_clf.predict(pd.DataFrame([xdash], columns=columns))[0]


df.head()
print(x.columns)
predict_price_func('1st Phase JP Nagar', 1200, 2, 2)
predict_price_func('Indira Nagar', 1200, 3, 3)
predict_price_func('Indira Nagar', 1200, 1, 3)
predict_price_func('Indira Nagar', 1200, 3, 4)

# --- Gradio demo -----------------------------------------------------------
# Replaces the notebook-only `!pip install gradio` shell magic, which is not
# valid Python outside IPython: install only when the import fails.
try:
    import gradio as gr
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio"])
    import gradio as gr
from gradio.components import Textbox, Number

# Inputs are listed in the parameter order of predict_price_func:
# location, sqft, bath, bhk.
interface = gr.Interface(
    fn=predict_price_func,
    inputs=[
        Textbox(label="Location"),
        Number(label="Total area (sq ft)"),
        Number(label="Bathrooms"),
        Number(label="Bedrooms (BHK)"),
    ],
    outputs="text",
)
interface.launch()