{
"cells": [
{
"cell_type": "code",
"execution_count": 89,
"id": "2668805c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2566dffa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" area_type | \n",
" availability | \n",
" location | \n",
" size | \n",
" society | \n",
" total_sqft | \n",
" bath | \n",
" balcony | \n",
" price | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Super built-up Area | \n",
" 19-Dec | \n",
" Electronic City Phase II | \n",
" 2 BHK | \n",
" Coomee | \n",
" 1056 | \n",
" 2.0 | \n",
" 1.0 | \n",
" 39.07 | \n",
"
\n",
" \n",
" 1 | \n",
" Plot Area | \n",
" Ready To Move | \n",
" Chikka Tirupathi | \n",
" 4 Bedroom | \n",
" Theanmp | \n",
" 2600 | \n",
" 5.0 | \n",
" 3.0 | \n",
" 120.00 | \n",
"
\n",
" \n",
" 2 | \n",
" Built-up Area | \n",
" Ready To Move | \n",
" Uttarahalli | \n",
" 3 BHK | \n",
" NaN | \n",
" 1440 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 62.00 | \n",
"
\n",
" \n",
" 3 | \n",
" Super built-up Area | \n",
" Ready To Move | \n",
" Lingadheeranahalli | \n",
" 3 BHK | \n",
" Soiewre | \n",
" 1521 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 95.00 | \n",
"
\n",
" \n",
" 4 | \n",
" Super built-up Area | \n",
" Ready To Move | \n",
" Kothanur | \n",
" 2 BHK | \n",
" NaN | \n",
" 1200 | \n",
" 2.0 | \n",
" 1.0 | \n",
" 51.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" area_type availability location size \\\n",
"0 Super built-up Area 19-Dec Electronic City Phase II 2 BHK \n",
"1 Plot Area Ready To Move Chikka Tirupathi 4 Bedroom \n",
"2 Built-up Area Ready To Move Uttarahalli 3 BHK \n",
"3 Super built-up Area Ready To Move Lingadheeranahalli 3 BHK \n",
"4 Super built-up Area Ready To Move Kothanur 2 BHK \n",
"\n",
" society total_sqft bath balcony price \n",
"0 Coomee 1056 2.0 1.0 39.07 \n",
"1 Theanmp 2600 5.0 3.0 120.00 \n",
"2 NaN 1440 2.0 3.0 62.00 \n",
"3 Soiewre 1521 3.0 1.0 95.00 \n",
"4 NaN 1200 2.0 1.0 51.00 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1=pd.read_csv('bengaluru_house_prices.csv')\n",
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "141f3c7b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13320, 9)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a666c8e7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['area_type', 'availability', 'location', 'size', 'society',\n",
" 'total_sqft', 'bath', 'balcony', 'price'],\n",
" dtype='object')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.columns"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c4b78e77",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Super built-up Area', 'Plot Area', 'Built-up Area',\n",
" 'Carpet Area'], dtype=object)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1['area_type'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d2b1286a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Super built-up Area 8790\n",
"Built-up Area 2418\n",
"Plot Area 2025\n",
"Carpet Area 87\n",
"Name: area_type, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1['area_type'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4776cabc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Electronic City Phase II | \n",
" 2 BHK | \n",
" 1056 | \n",
" 2.0 | \n",
" 39.07 | \n",
"
\n",
" \n",
" 1 | \n",
" Chikka Tirupathi | \n",
" 4 Bedroom | \n",
" 2600 | \n",
" 5.0 | \n",
" 120.00 | \n",
"
\n",
" \n",
" 2 | \n",
" Uttarahalli | \n",
" 3 BHK | \n",
" 1440 | \n",
" 2.0 | \n",
" 62.00 | \n",
"
\n",
" \n",
" 3 | \n",
" Lingadheeranahalli | \n",
" 3 BHK | \n",
" 1521 | \n",
" 3.0 | \n",
" 95.00 | \n",
"
\n",
" \n",
" 4 | \n",
" Kothanur | \n",
" 2 BHK | \n",
" 1200 | \n",
" 2.0 | \n",
" 51.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price\n",
"0 Electronic City Phase II 2 BHK 1056 2.0 39.07\n",
"1 Chikka Tirupathi 4 Bedroom 2600 5.0 120.00\n",
"2 Uttarahalli 3 BHK 1440 2.0 62.00\n",
"3 Lingadheeranahalli 3 BHK 1521 3.0 95.00\n",
"4 Kothanur 2 BHK 1200 2.0 51.00"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2=df1.drop(['area_type','availability','society','balcony'],axis='columns')\n",
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b971be8c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13320, 5)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.shape"
]
},
{
"cell_type": "markdown",
"id": "da19f381",
"metadata": {},
"source": [
"### Data Cleaning: Handling NA values"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "84ffb66b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"location 1\n",
"size 16\n",
"total_sqft 0\n",
"bath 73\n",
"price 0\n",
"dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c7cc2a04",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"location 0\n",
"size 0\n",
"total_sqft 0\n",
"bath 0\n",
"price 0\n",
"dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3=df2.dropna()\n",
"df3.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "1a0f5f5d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13246, 5)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3.shape"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "618deabb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Dinesh_vivobook\\AppData\\Local\\Temp\\ipykernel_6088\\1261436634.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df3['bhk']=df3['size'].apply(lambda x: int(x.split(' ')[0]))\n"
]
},
{
"data": {
"text/plain": [
"array([ 2, 4, 3, 6, 1, 8, 7, 5, 11, 9, 27, 10, 19, 16, 43, 14, 12,\n",
" 13, 18], dtype=int64)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3['bhk']=df3['size'].apply(lambda x: int(x.split(' ')[0]))\n",
"df3.bhk.unique()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "c027ea7e",
"metadata": {},
"outputs": [],
"source": [
"def is_float(x):\n",
" try:\n",
" float(x)\n",
" except:\n",
" return False\n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "506c5d8e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
"
\n",
" \n",
" \n",
" \n",
" 30 | \n",
" Yelahanka | \n",
" 4 BHK | \n",
" 2100 - 2850 | \n",
" 4.0 | \n",
" 186.000 | \n",
" 4 | \n",
"
\n",
" \n",
" 122 | \n",
" Hebbal | \n",
" 4 BHK | \n",
" 3067 - 8156 | \n",
" 4.0 | \n",
" 477.000 | \n",
" 4 | \n",
"
\n",
" \n",
" 137 | \n",
" 8th Phase JP Nagar | \n",
" 2 BHK | \n",
" 1042 - 1105 | \n",
" 2.0 | \n",
" 54.005 | \n",
" 2 | \n",
"
\n",
" \n",
" 165 | \n",
" Sarjapur | \n",
" 2 BHK | \n",
" 1145 - 1340 | \n",
" 2.0 | \n",
" 43.490 | \n",
" 2 | \n",
"
\n",
" \n",
" 188 | \n",
" KR Puram | \n",
" 2 BHK | \n",
" 1015 - 1540 | \n",
" 2.0 | \n",
" 56.800 | \n",
" 2 | \n",
"
\n",
" \n",
" 410 | \n",
" Kengeri | \n",
" 1 BHK | \n",
" 34.46Sq. Meter | \n",
" 1.0 | \n",
" 18.500 | \n",
" 1 | \n",
"
\n",
" \n",
" 549 | \n",
" Hennur Road | \n",
" 2 BHK | \n",
" 1195 - 1440 | \n",
" 2.0 | \n",
" 63.770 | \n",
" 2 | \n",
"
\n",
" \n",
" 648 | \n",
" Arekere | \n",
" 9 Bedroom | \n",
" 4125Perch | \n",
" 9.0 | \n",
" 265.000 | \n",
" 9 | \n",
"
\n",
" \n",
" 661 | \n",
" Yelahanka | \n",
" 2 BHK | \n",
" 1120 - 1145 | \n",
" 2.0 | \n",
" 48.130 | \n",
" 2 | \n",
"
\n",
" \n",
" 672 | \n",
" Bettahalsoor | \n",
" 4 Bedroom | \n",
" 3090 - 5002 | \n",
" 4.0 | \n",
" 445.000 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price bhk\n",
"30 Yelahanka 4 BHK 2100 - 2850 4.0 186.000 4\n",
"122 Hebbal 4 BHK 3067 - 8156 4.0 477.000 4\n",
"137 8th Phase JP Nagar 2 BHK 1042 - 1105 2.0 54.005 2\n",
"165 Sarjapur 2 BHK 1145 - 1340 2.0 43.490 2\n",
"188 KR Puram 2 BHK 1015 - 1540 2.0 56.800 2\n",
"410 Kengeri 1 BHK 34.46Sq. Meter 1.0 18.500 1\n",
"549 Hennur Road 2 BHK 1195 - 1440 2.0 63.770 2\n",
"648 Arekere 9 Bedroom 4125Perch 9.0 265.000 9\n",
"661 Yelahanka 2 BHK 1120 - 1145 2.0 48.130 2\n",
"672 Bettahalsoor 4 Bedroom 3090 - 5002 4.0 445.000 4"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3[~df3['total_sqft'].apply(is_float)].head(10)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "53cb2ebd",
"metadata": {},
"outputs": [],
"source": [
"def convert_sqft_to_num(x):\n",
" tokens=x.split('-')\n",
" if(len(tokens)==2):\n",
" return (float(tokens[0])+float(tokens[1]))/2\n",
" try:\n",
" return float(x)\n",
" except:\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "990186c4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Electronic City Phase II | \n",
" 2 BHK | \n",
" 1056.0 | \n",
" 2.0 | \n",
" 39.07 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" Chikka Tirupathi | \n",
" 4 Bedroom | \n",
" 2600.0 | \n",
" 5.0 | \n",
" 120.00 | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" Uttarahalli | \n",
" 3 BHK | \n",
" 1440.0 | \n",
" 2.0 | \n",
" 62.00 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" Lingadheeranahalli | \n",
" 3 BHK | \n",
" 1521.0 | \n",
" 3.0 | \n",
" 95.00 | \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
" Kothanur | \n",
" 2 BHK | \n",
" 1200.0 | \n",
" 2.0 | \n",
" 51.00 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price bhk\n",
"0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2\n",
"1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4\n",
"2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3\n",
"3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3\n",
"4 Kothanur 2 BHK 1200.0 2.0 51.00 2"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df4=df3.copy()\n",
"df4.total_sqft=df4.total_sqft.apply(convert_sqft_to_num)\n",
"df4=df4[df4.total_sqft.notnull()]\n",
"df4.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "7f1b0177",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" price_per_sqft | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Electronic City Phase II | \n",
" 2 BHK | \n",
" 1056.0 | \n",
" 2.0 | \n",
" 39.07 | \n",
" 2 | \n",
" 3699.810606 | \n",
"
\n",
" \n",
" 1 | \n",
" Chikka Tirupathi | \n",
" 4 Bedroom | \n",
" 2600.0 | \n",
" 5.0 | \n",
" 120.00 | \n",
" 4 | \n",
" 4615.384615 | \n",
"
\n",
" \n",
" 2 | \n",
" Uttarahalli | \n",
" 3 BHK | \n",
" 1440.0 | \n",
" 2.0 | \n",
" 62.00 | \n",
" 3 | \n",
" 4305.555556 | \n",
"
\n",
" \n",
" 3 | \n",
" Lingadheeranahalli | \n",
" 3 BHK | \n",
" 1521.0 | \n",
" 3.0 | \n",
" 95.00 | \n",
" 3 | \n",
" 6245.890861 | \n",
"
\n",
" \n",
" 4 | \n",
" Kothanur | \n",
" 2 BHK | \n",
" 1200.0 | \n",
" 2.0 | \n",
" 51.00 | \n",
" 2 | \n",
" 4250.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price bhk \\\n",
"0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2 \n",
"1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4 \n",
"2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3 \n",
"3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3 \n",
"4 Kothanur 2 BHK 1200.0 2.0 51.00 2 \n",
"\n",
" price_per_sqft \n",
"0 3699.810606 \n",
"1 4615.384615 \n",
"2 4305.555556 \n",
"3 6245.890861 \n",
"4 4250.000000 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5=df4.copy()\n",
"df5['price_per_sqft']=df5['price']*100000/df5['total_sqft']\n",
"df5.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "ad0a2319",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1.320000e+04\n",
"mean 7.920759e+03\n",
"std 1.067272e+05\n",
"min 2.678298e+02\n",
"25% 4.267701e+03\n",
"50% 5.438331e+03\n",
"75% 7.317073e+03\n",
"max 1.200000e+07\n",
"Name: price_per_sqft, dtype: float64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5_stats=df5['price_per_sqft'].describe()\n",
"df5_stats"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "fa41ad05",
"metadata": {},
"outputs": [],
"source": [
"df5.to_csv(\"bhp.csv\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "17d613ae",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13200, 7)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5.shape"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "9cc20326",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Whitefield 533\n",
"Sarjapur Road 392\n",
"Electronic City 304\n",
"Kanakpura Road 264\n",
"Thanisandra 235\n",
" ... \n",
"Rajanna Layout 1\n",
"Subramanyanagar 1\n",
"Lakshmipura Vidyaanyapura 1\n",
"Malur Hosur Road 1\n",
"Abshot Layout 1\n",
"Name: location, Length: 1287, dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5.location = df5.location.apply(lambda x: x.strip())\n",
"location_stats = df5['location'].value_counts(ascending=False)\n",
"location_stats"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "b0806611",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13200"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"location_stats.values.sum()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "d9984a8b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"240"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(location_stats[location_stats>10])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "59cac967",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1287"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(location_stats)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "2fe2b271",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1047"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(location_stats[location_stats<=10])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "f3cddd30",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BTM 1st Stage 10\n",
"Gunjur Palya 10\n",
"Nagappa Reddy Layout 10\n",
"Sector 1 HSR Layout 10\n",
"Thyagaraja Nagar 10\n",
" ..\n",
"Rajanna Layout 1\n",
"Subramanyanagar 1\n",
"Lakshmipura Vidyaanyapura 1\n",
"Malur Hosur Road 1\n",
"Abshot Layout 1\n",
"Name: location, Length: 1047, dtype: int64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"location_stats_less_than_10 = location_stats[location_stats<=10]\n",
"location_stats_less_than_10"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6be6b2e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1287"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(df5.location.unique())"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "5a9ea722",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"241"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)\n",
"len(df5.location.unique())"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "bccc5a57",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" price_per_sqft | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Electronic City Phase II | \n",
" 2 BHK | \n",
" 1056.0 | \n",
" 2.0 | \n",
" 39.07 | \n",
" 2 | \n",
" 3699.810606 | \n",
"
\n",
" \n",
" 1 | \n",
" Chikka Tirupathi | \n",
" 4 Bedroom | \n",
" 2600.0 | \n",
" 5.0 | \n",
" 120.00 | \n",
" 4 | \n",
" 4615.384615 | \n",
"
\n",
" \n",
" 2 | \n",
" Uttarahalli | \n",
" 3 BHK | \n",
" 1440.0 | \n",
" 2.0 | \n",
" 62.00 | \n",
" 3 | \n",
" 4305.555556 | \n",
"
\n",
" \n",
" 3 | \n",
" Lingadheeranahalli | \n",
" 3 BHK | \n",
" 1521.0 | \n",
" 3.0 | \n",
" 95.00 | \n",
" 3 | \n",
" 6245.890861 | \n",
"
\n",
" \n",
" 4 | \n",
" Kothanur | \n",
" 2 BHK | \n",
" 1200.0 | \n",
" 2.0 | \n",
" 51.00 | \n",
" 2 | \n",
" 4250.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" Whitefield | \n",
" 2 BHK | \n",
" 1170.0 | \n",
" 2.0 | \n",
" 38.00 | \n",
" 2 | \n",
" 3247.863248 | \n",
"
\n",
" \n",
" 6 | \n",
" Old Airport Road | \n",
" 4 BHK | \n",
" 2732.0 | \n",
" 4.0 | \n",
" 204.00 | \n",
" 4 | \n",
" 7467.057101 | \n",
"
\n",
" \n",
" 7 | \n",
" Rajaji Nagar | \n",
" 4 BHK | \n",
" 3300.0 | \n",
" 4.0 | \n",
" 600.00 | \n",
" 4 | \n",
" 18181.818182 | \n",
"
\n",
" \n",
" 8 | \n",
" Marathahalli | \n",
" 3 BHK | \n",
" 1310.0 | \n",
" 3.0 | \n",
" 63.25 | \n",
" 3 | \n",
" 4828.244275 | \n",
"
\n",
" \n",
" 9 | \n",
" other | \n",
" 6 Bedroom | \n",
" 1020.0 | \n",
" 6.0 | \n",
" 370.00 | \n",
" 6 | \n",
" 36274.509804 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price bhk \\\n",
"0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2 \n",
"1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4 \n",
"2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3 \n",
"3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3 \n",
"4 Kothanur 2 BHK 1200.0 2.0 51.00 2 \n",
"5 Whitefield 2 BHK 1170.0 2.0 38.00 2 \n",
"6 Old Airport Road 4 BHK 2732.0 4.0 204.00 4 \n",
"7 Rajaji Nagar 4 BHK 3300.0 4.0 600.00 4 \n",
"8 Marathahalli 3 BHK 1310.0 3.0 63.25 3 \n",
"9 other 6 Bedroom 1020.0 6.0 370.00 6 \n",
"\n",
" price_per_sqft \n",
"0 3699.810606 \n",
"1 4615.384615 \n",
"2 4305.555556 \n",
"3 6245.890861 \n",
"4 4250.000000 \n",
"5 3247.863248 \n",
"6 7467.057101 \n",
"7 18181.818182 \n",
"8 4828.244275 \n",
"9 36274.509804 "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "8bc27536",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" price_per_sqft | \n",
"
\n",
" \n",
" \n",
" \n",
" 9 | \n",
" other | \n",
" 6 Bedroom | \n",
" 1020.0 | \n",
" 6.0 | \n",
" 370.0 | \n",
" 6 | \n",
" 36274.509804 | \n",
"
\n",
" \n",
" 45 | \n",
" HSR Layout | \n",
" 8 Bedroom | \n",
" 600.0 | \n",
" 9.0 | \n",
" 200.0 | \n",
" 8 | \n",
" 33333.333333 | \n",
"
\n",
" \n",
" 58 | \n",
" Murugeshpalya | \n",
" 6 Bedroom | \n",
" 1407.0 | \n",
" 4.0 | \n",
" 150.0 | \n",
" 6 | \n",
" 10660.980810 | \n",
"
\n",
" \n",
" 68 | \n",
" Devarachikkanahalli | \n",
" 8 Bedroom | \n",
" 1350.0 | \n",
" 7.0 | \n",
" 85.0 | \n",
" 8 | \n",
" 6296.296296 | \n",
"
\n",
" \n",
" 70 | \n",
" other | \n",
" 3 Bedroom | \n",
" 500.0 | \n",
" 3.0 | \n",
" 100.0 | \n",
" 3 | \n",
" 20000.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price bhk \\\n",
"9 other 6 Bedroom 1020.0 6.0 370.0 6 \n",
"45 HSR Layout 8 Bedroom 600.0 9.0 200.0 8 \n",
"58 Murugeshpalya 6 Bedroom 1407.0 4.0 150.0 6 \n",
"68 Devarachikkanahalli 8 Bedroom 1350.0 7.0 85.0 8 \n",
"70 other 3 Bedroom 500.0 3.0 100.0 3 \n",
"\n",
" price_per_sqft \n",
"9 36274.509804 \n",
"45 33333.333333 \n",
"58 10660.980810 \n",
"68 6296.296296 \n",
"70 20000.000000 "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5[df5.total_sqft/df5.bhk<300].head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "091e3fda",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13200, 7)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5.shape"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "458efc85",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(12456, 7)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df6 = df5[~(df5.total_sqft/df5.bhk<300)]\n",
"df6.shape"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "d3da79d6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 12456.000000\n",
"mean 6308.502826\n",
"std 4168.127339\n",
"min 267.829813\n",
"25% 4210.526316\n",
"50% 5294.117647\n",
"75% 6916.666667\n",
"max 176470.588235\n",
"Name: price_per_sqft, dtype: float64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df6.price_per_sqft.describe()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "fb2efbc6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(10242, 7)"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def remove_pps_outliers(df):\n",
" df_out = pd.DataFrame()\n",
" for key, subdf in df.groupby('location'):\n",
" m = np.mean(subdf.price_per_sqft)\n",
" st = np.std(subdf.price_per_sqft)\n",
" reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]\n",
" df_out = pd.concat([df_out,reduced_df],ignore_index=True)\n",
" return df_out\n",
"df7 = remove_pps_outliers(df6)\n",
"df7.shape"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "c5bba9cf",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"matplotlib.rcParams[\"figure.figsize\"] = (20,10)\n",
"def plot_scatter_chart(df,location):\n",
" bhk2 = df[(df.location==location) & (df.bhk==2)]\n",
" bhk3 = df[(df.location==location) & (df.bhk==3)]\n",
" matplotlib.rcParams['figure.figsize'] = (15,10)\n",
" plt.scatter(bhk2.total_sqft,bhk2.price,color='blue',label='2 BHK', s=50)\n",
" plt.scatter(bhk3.total_sqft,bhk3.price,marker='+', color='green',label='3 BHK', s=50)\n",
" plt.xlabel(\"Total Square Feet Area\")\n",
" plt.ylabel(\"Price (Lakh Indian Rupees)\")\n",
" plt.title(location)\n",
" plt.legend()\n",
" \n",
"plot_scatter_chart(df7,\"Rajaji Nagar\")"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "013b9b86",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7317, 7)"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def remove_bhk_outliers(df):\n",
" exclude_indices = np.array([])\n",
" for location, location_df in df.groupby('location'):\n",
" bhk_stats = {}\n",
" for bhk, bhk_df in location_df.groupby('bhk'):\n",
" bhk_stats[bhk] = {\n",
" 'mean': np.mean(bhk_df.price_per_sqft),\n",
" 'std': np.std(bhk_df.price_per_sqft),\n",
" 'count': bhk_df.shape[0]\n",
" }\n",
" for bhk, bhk_df in location_df.groupby('bhk'):\n",
" stats = bhk_stats.get(bhk-1)\n",
" if stats and stats['count']>5:\n",
" exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)\n",
" return df.drop(exclude_indices,axis='index')\n",
"df8 = remove_bhk_outliers(df7)\n",
"# df8 = df7.copy()\n",
"df8.shape"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "534f884f",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plot_scatter_chart(df8,\"Rajaji Nagar\")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "ee8e044e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'Count')"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib\n",
"matplotlib.rcParams[\"figure.figsize\"] = (20,10)\n",
"plt.hist(df8.price_per_sqft,rwidth=0.8)\n",
"plt.xlabel(\"Price Per Square Feet\")\n",
"plt.ylabel(\"Count\")"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "a26184e1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 4., 3., 2., 5., 8., 1., 6., 7., 9., 12., 16., 13.])"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df8.bath.unique()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "38bc530e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'Count')"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.hist(df8.bath,rwidth=0.8)\n",
"plt.xlabel(\"Number of bathrooms\")\n",
"plt.ylabel(\"Count\")"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "65fd495f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" price_per_sqft | \n",
"
\n",
" \n",
" \n",
" \n",
" 5277 | \n",
" Neeladri Nagar | \n",
" 10 BHK | \n",
" 4000.0 | \n",
" 12.0 | \n",
" 160.0 | \n",
" 10 | \n",
" 4000.000000 | \n",
"
\n",
" \n",
" 8483 | \n",
" other | \n",
" 10 BHK | \n",
" 12000.0 | \n",
" 12.0 | \n",
" 525.0 | \n",
" 10 | \n",
" 4375.000000 | \n",
"
\n",
" \n",
" 8572 | \n",
" other | \n",
" 16 BHK | \n",
" 10000.0 | \n",
" 16.0 | \n",
" 550.0 | \n",
" 16 | \n",
" 5500.000000 | \n",
"
\n",
" \n",
" 9306 | \n",
" other | \n",
" 11 BHK | \n",
" 6000.0 | \n",
" 12.0 | \n",
" 150.0 | \n",
" 11 | \n",
" 2500.000000 | \n",
"
\n",
" \n",
" 9637 | \n",
" other | \n",
" 13 BHK | \n",
" 5425.0 | \n",
" 13.0 | \n",
" 275.0 | \n",
" 13 | \n",
" 5069.124424 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price bhk price_per_sqft\n",
"5277 Neeladri Nagar 10 BHK 4000.0 12.0 160.0 10 4000.000000\n",
"8483 other 10 BHK 12000.0 12.0 525.0 10 4375.000000\n",
"8572 other 16 BHK 10000.0 16.0 550.0 16 5500.000000\n",
"9306 other 11 BHK 6000.0 12.0 150.0 11 2500.000000\n",
"9637 other 13 BHK 5425.0 13.0 275.0 13 5069.124424"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df8[df8.bath>10]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "07bd7517",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" price_per_sqft | \n",
"
\n",
" \n",
" \n",
" \n",
" 1626 | \n",
" Chikkabanavar | \n",
" 4 Bedroom | \n",
" 2460.0 | \n",
" 7.0 | \n",
" 80.0 | \n",
" 4 | \n",
" 3252.032520 | \n",
"
\n",
" \n",
" 5238 | \n",
" Nagasandra | \n",
" 4 Bedroom | \n",
" 7000.0 | \n",
" 8.0 | \n",
" 450.0 | \n",
" 4 | \n",
" 6428.571429 | \n",
"
\n",
" \n",
" 6711 | \n",
" Thanisandra | \n",
" 3 BHK | \n",
" 1806.0 | \n",
" 6.0 | \n",
" 116.0 | \n",
" 3 | \n",
" 6423.034330 | \n",
"
\n",
" \n",
" 8408 | \n",
" other | \n",
" 6 BHK | \n",
" 11338.0 | \n",
" 9.0 | \n",
" 1000.0 | \n",
" 6 | \n",
" 8819.897689 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location size total_sqft bath price bhk price_per_sqft\n",
"1626 Chikkabanavar 4 Bedroom 2460.0 7.0 80.0 4 3252.032520\n",
"5238 Nagasandra 4 Bedroom 7000.0 8.0 450.0 4 6428.571429\n",
"6711 Thanisandra 3 BHK 1806.0 6.0 116.0 3 6423.034330\n",
"8408 other 6 BHK 11338.0 9.0 1000.0 6 8819.897689"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df8[df8.bath>df8.bhk+2]"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "0b16203c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7239, 7)"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df9 = df8[df8.bath\n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" location | \n",
" size | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" price_per_sqft | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1st Block Jayanagar | \n",
" 4 BHK | \n",
" 2850.0 | \n",
" 4.0 | \n",
" 428.0 | \n",
" 4 | \n",
" 15017.543860 | \n",
"
\n",
" \n",
" 1 | \n",
" 1st Block Jayanagar | \n",
" 3 BHK | \n",
" 1630.0 | \n",
" 3.0 | \n",
" 194.0 | \n",
" 3 | \n",
" 11901.840491 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" location size total_sqft bath price bhk price_per_sqft\n",
"0 1st Block Jayanagar 4 BHK 2850.0 4.0 428.0 4 15017.543860\n",
"1 1st Block Jayanagar 3 BHK 1630.0 3.0 194.0 3 11901.840491"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df9.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "d61cfae3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1st Block Jayanagar | \n",
" 2850.0 | \n",
" 4.0 | \n",
" 428.0 | \n",
" 4 | \n",
"
\n",
" \n",
" 1 | \n",
" 1st Block Jayanagar | \n",
" 1630.0 | \n",
" 3.0 | \n",
" 194.0 | \n",
" 3 | \n",
"
\n",
" \n",
" 2 | \n",
" 1st Block Jayanagar | \n",
" 1875.0 | \n",
" 2.0 | \n",
" 235.0 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" location total_sqft bath price bhk\n",
"0 1st Block Jayanagar 2850.0 4.0 428.0 4\n",
"1 1st Block Jayanagar 1630.0 3.0 194.0 3\n",
"2 1st Block Jayanagar 1875.0 2.0 235.0 3"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df10 = df9.drop(['size','price_per_sqft'],axis='columns')\n",
"df10.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "7da53193",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1st Block Jayanagar | \n",
" 1st Phase JP Nagar | \n",
" 2nd Phase Judicial Layout | \n",
" 2nd Stage Nagarbhavi | \n",
" 5th Block Hbr Layout | \n",
" 5th Phase JP Nagar | \n",
" 6th Phase JP Nagar | \n",
" 7th Phase JP Nagar | \n",
" 8th Phase JP Nagar | \n",
" 9th Phase JP Nagar | \n",
" ... | \n",
" Vishveshwarya Layout | \n",
" Vishwapriya Layout | \n",
" Vittasandra | \n",
" Whitefield | \n",
" Yelachenahalli | \n",
" Yelahanka | \n",
" Yelahanka New Town | \n",
" Yelenahalli | \n",
" Yeshwanthpur | \n",
" other | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 241 columns
\n",
"
"
],
"text/plain": [
" 1st Block Jayanagar 1st Phase JP Nagar 2nd Phase Judicial Layout \\\n",
"0 1 0 0 \n",
"1 1 0 0 \n",
"2 1 0 0 \n",
"\n",
" 2nd Stage Nagarbhavi 5th Block Hbr Layout 5th Phase JP Nagar \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"\n",
" 6th Phase JP Nagar 7th Phase JP Nagar 8th Phase JP Nagar \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"\n",
" 9th Phase JP Nagar ... Vishveshwarya Layout Vishwapriya Layout \\\n",
"0 0 ... 0 0 \n",
"1 0 ... 0 0 \n",
"2 0 ... 0 0 \n",
"\n",
" Vittasandra Whitefield Yelachenahalli Yelahanka Yelahanka New Town \\\n",
"0 0 0 0 0 0 \n",
"1 0 0 0 0 0 \n",
"2 0 0 0 0 0 \n",
"\n",
" Yelenahalli Yeshwanthpur other \n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"\n",
"[3 rows x 241 columns]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode the location column: one indicator column per distinct location value.\n",
"dummies = pd.get_dummies(df10['location'])\n",
"dummies.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "f6b6d2bf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" location | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" 1st Block Jayanagar | \n",
" 1st Phase JP Nagar | \n",
" 2nd Phase Judicial Layout | \n",
" 2nd Stage Nagarbhavi | \n",
" 5th Block Hbr Layout | \n",
" ... | \n",
" Vijayanagar | \n",
" Vishveshwarya Layout | \n",
" Vishwapriya Layout | \n",
" Vittasandra | \n",
" Whitefield | \n",
" Yelachenahalli | \n",
" Yelahanka | \n",
" Yelahanka New Town | \n",
" Yelenahalli | \n",
" Yeshwanthpur | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1st Block Jayanagar | \n",
" 2850.0 | \n",
" 4.0 | \n",
" 428.0 | \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1st Block Jayanagar | \n",
" 1630.0 | \n",
" 3.0 | \n",
" 194.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1st Block Jayanagar | \n",
" 1875.0 | \n",
" 2.0 | \n",
" 235.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1st Block Jayanagar | \n",
" 1200.0 | \n",
" 2.0 | \n",
" 130.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1st Block Jayanagar | \n",
" 1235.0 | \n",
" 2.0 | \n",
" 148.0 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 245 columns
\n",
"
"
],
"text/plain": [
" location total_sqft bath price bhk 1st Block Jayanagar \\\n",
"0 1st Block Jayanagar 2850.0 4.0 428.0 4 1 \n",
"1 1st Block Jayanagar 1630.0 3.0 194.0 3 1 \n",
"2 1st Block Jayanagar 1875.0 2.0 235.0 3 1 \n",
"3 1st Block Jayanagar 1200.0 2.0 130.0 3 1 \n",
"4 1st Block Jayanagar 1235.0 2.0 148.0 2 1 \n",
"\n",
" 1st Phase JP Nagar 2nd Phase Judicial Layout 2nd Stage Nagarbhavi \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"\n",
" 5th Block Hbr Layout ... Vijayanagar Vishveshwarya Layout \\\n",
"0 0 ... 0 0 \n",
"1 0 ... 0 0 \n",
"2 0 ... 0 0 \n",
"3 0 ... 0 0 \n",
"4 0 ... 0 0 \n",
"\n",
" Vishwapriya Layout Vittasandra Whitefield Yelachenahalli Yelahanka \\\n",
"0 0 0 0 0 0 \n",
"1 0 0 0 0 0 \n",
"2 0 0 0 0 0 \n",
"3 0 0 0 0 0 \n",
"4 0 0 0 0 0 \n",
"\n",
" Yelahanka New Town Yelenahalli Yeshwanthpur \n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"\n",
"[5 rows x 245 columns]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Append the location indicators side by side, leaving out the 'other' column.\n",
"df11 = pd.concat(\n",
"    [df10, dummies.drop(columns='other')],\n",
"    axis=1,\n",
")\n",
"df11.head()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "b57187c1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" total_sqft | \n",
" bath | \n",
" price | \n",
" bhk | \n",
" 1st Block Jayanagar | \n",
" 1st Phase JP Nagar | \n",
" 2nd Phase Judicial Layout | \n",
" 2nd Stage Nagarbhavi | \n",
" 5th Block Hbr Layout | \n",
" 5th Phase JP Nagar | \n",
" ... | \n",
" Vijayanagar | \n",
" Vishveshwarya Layout | \n",
" Vishwapriya Layout | \n",
" Vittasandra | \n",
" Whitefield | \n",
" Yelachenahalli | \n",
" Yelahanka | \n",
" Yelahanka New Town | \n",
" Yelenahalli | \n",
" Yeshwanthpur | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2850.0 | \n",
" 4.0 | \n",
" 428.0 | \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1630.0 | \n",
" 3.0 | \n",
" 194.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
2 rows × 244 columns
\n",
"
"
],
"text/plain": [
" total_sqft bath price bhk 1st Block Jayanagar 1st Phase JP Nagar \\\n",
"0 2850.0 4.0 428.0 4 1 0 \n",
"1 1630.0 3.0 194.0 3 1 0 \n",
"\n",
" 2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr Layout \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"\n",
" 5th Phase JP Nagar ... Vijayanagar Vishveshwarya Layout \\\n",
"0 0 ... 0 0 \n",
"1 0 ... 0 0 \n",
"\n",
" Vishwapriya Layout Vittasandra Whitefield Yelachenahalli Yelahanka \\\n",
"0 0 0 0 0 0 \n",
"1 0 0 0 0 0 \n",
"\n",
" Yelahanka New Town Yelenahalli Yeshwanthpur \n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"\n",
"[2 rows x 244 columns]"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop the text 'location' column; the indicator columns added above remain.\n",
"df12 = df11.drop(columns='location')\n",
"df12.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "954b49f6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7239, 244)"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df12.shape"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "e4835adf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" total_sqft | \n",
" bath | \n",
" bhk | \n",
" 1st Block Jayanagar | \n",
" 1st Phase JP Nagar | \n",
" 2nd Phase Judicial Layout | \n",
" 2nd Stage Nagarbhavi | \n",
" 5th Block Hbr Layout | \n",
" 5th Phase JP Nagar | \n",
" 6th Phase JP Nagar | \n",
" ... | \n",
" Vijayanagar | \n",
" Vishveshwarya Layout | \n",
" Vishwapriya Layout | \n",
" Vittasandra | \n",
" Whitefield | \n",
" Yelachenahalli | \n",
" Yelahanka | \n",
" Yelahanka New Town | \n",
" Yelenahalli | \n",
" Yeshwanthpur | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2850.0 | \n",
" 4.0 | \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1630.0 | \n",
" 3.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1875.0 | \n",
" 2.0 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 243 columns
\n",
"
"
],
"text/plain": [
" total_sqft bath bhk 1st Block Jayanagar 1st Phase JP Nagar \\\n",
"0 2850.0 4.0 4 1 0 \n",
"1 1630.0 3.0 3 1 0 \n",
"2 1875.0 2.0 3 1 0 \n",
"\n",
" 2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr Layout \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"\n",
" 5th Phase JP Nagar 6th Phase JP Nagar ... Vijayanagar \\\n",
"0 0 0 ... 0 \n",
"1 0 0 ... 0 \n",
"2 0 0 ... 0 \n",
"\n",
" Vishveshwarya Layout Vishwapriya Layout Vittasandra Whitefield \\\n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"\n",
" Yelachenahalli Yelahanka Yelahanka New Town Yelenahalli Yeshwanthpur \n",
"0 0 0 0 0 0 \n",
"1 0 0 0 0 0 \n",
"2 0 0 0 0 0 \n",
"\n",
"[3 rows x 243 columns]"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Feature matrix: every column of df12 except 'price'.\n",
"X = df12.drop(columns='price')\n",
"X.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "f803bc44",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7239, 243)"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "3f722430",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 428.0\n",
"1 194.0\n",
"2 235.0\n",
"Name: price, dtype: float64"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Target: the 'price' column of df12.\n",
"y = df12['price']\n",
"y.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "3d6fddb3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7239"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(y)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "26df7c7c",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"# Keep 80% of the rows for training; the fixed random_state makes the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, train_size=0.8, random_state=13\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "ca73765a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8588660477968386"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"# Fit on the training split (fit() returns the estimator itself), then score on the held-out split.\n",
"lr_clf = LinearRegression().fit(X_train, y_train)\n",
"lr_clf.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "ee5f5b93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.82702546, 0.86027005, 0.85322178, 0.8436466 , 0.85481502])"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import ShuffleSplit, cross_val_score\n",
"\n",
"# Five shuffled 80/20 splits with a fixed seed; one score is returned per split.\n",
"cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)\n",
"cross_val_score(LinearRegression(), X, y, cv=cv)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "41bd8610",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" model | \n",
" best_score | \n",
" best_params | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" linear_regression | \n",
" 0.847796 | \n",
" {'normalize': False} | \n",
"
\n",
" \n",
" 1 | \n",
" lasso | \n",
" 0.726861 | \n",
" {'alpha': 2, 'selection': 'random'} | \n",
"
\n",
" \n",
" 2 | \n",
" decision_tree | \n",
" 0.718810 | \n",
" {'criterion': 'mse', 'splitter': 'random'} | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" model best_score best_params\n",
"0 linear_regression 0.847796 {'normalize': False}\n",
"1 lasso 0.726861 {'alpha': 2, 'selection': 'random'}\n",
"2 decision_tree 0.718810 {'criterion': 'mse', 'splitter': 'random'}"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.linear_model import Lasso\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"def find_best_model_using_gridsearchcv(X,y):\n",
"    \"\"\"Grid-search a few regressors and tabulate the best result of each.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : feature matrix accepted by scikit-learn estimators.\n",
"    y : target values aligned with the rows of ``X``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per candidate model with the columns ``model``,\n",
"        ``best_score`` (``GridSearchCV.best_score_``) and ``best_params``\n",
"        (``GridSearchCV.best_params_``).\n",
"    \"\"\"\n",
"    algos = {\n",
"        'linear_regression': {\n",
"            'model': LinearRegression(),\n",
"            'params': {\n",
"                # The former `normalize` option was deprecated in scikit-learn 1.0\n",
"                # and removed in 1.2, so search over `fit_intercept` instead.\n",
"                'fit_intercept': [True, False]\n",
"            }\n",
"        },\n",
"        'lasso': {\n",
"            'model': Lasso(),\n",
"            'params': {\n",
"                'alpha': [1, 2],\n",
"                'selection': ['random', 'cyclic']\n",
"            }\n",
"        },\n",
"        'decision_tree': {\n",
"            'model': DecisionTreeRegressor(),\n",
"            'params': {\n",
"                # 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and the\n",
"                # old spelling was removed in 1.2.\n",
"                'criterion': ['squared_error', 'friedman_mse'],\n",
"                'splitter': ['best', 'random']\n",
"            }\n",
"        }\n",
"    }\n",
"\n",
"    # The same five shuffled 80/20 splits are used for every candidate model.\n",
"    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)\n",
"    scores = []\n",
"    for algo_name, config in algos.items():\n",
"        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)\n",
"        gs.fit(X, y)\n",
"        scores.append({\n",
"            'model': algo_name,\n",
"            'best_score': gs.best_score_,\n",
"            'best_params': gs.best_params_\n",
"        })\n",
"\n",
"    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])\n",
"\n",
"find_best_model_using_gridsearchcv(X,y)\n"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "1e2267ce",
"metadata": {},
"outputs": [],
"source": [
"def predict_price(location,sqft,bath,bhk):\n",
"    \"\"\"Predict a price for one property with the fitted ``lr_clf`` model.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    location : name of a location column of ``X``. A name that has no\n",
"        matching column leaves every location indicator at 0.\n",
"    sqft, bath, bhk : values written to the first three feature\n",
"        positions, in that order.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The first (and only) element of ``lr_clf.predict`` for this row.\n",
"    \"\"\"\n",
"    x = np.zeros(len(X.columns))\n",
"    x[0] = sqft\n",
"    x[1] = bath\n",
"    x[2] = bhk\n",
"\n",
"    # np.where yields an empty index array when no column is named `location`\n",
"    # (for example a location without its own indicator column). Indexing that\n",
"    # array with [0][0] raised IndexError, so check for a match before using it.\n",
"    loc_matches = np.where(X.columns == location)[0]\n",
"    if loc_matches.size > 0:\n",
"        x[loc_matches[0]] = 1\n",
"\n",
"    return lr_clf.predict([x])[0]"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "92132555",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"90.56832868368073"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price('1st Phase JP Nagar',1000, 2, 2)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "93e6427d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"93.52384203792523"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price('1st Phase JP Nagar',1000, 3, 3)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "d2c3bee7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"184.177406050691"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price('Indira Nagar',1000, 3, 3)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "afae9289",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"181.2218926964465"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predict_price('Indira Nagar',1000, 2, 2)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "c3eee16b",
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"# Serialize the fitted model to disk in binary mode.\n",
"with open('banglore_home_prices_model.pickle', 'wb') as model_file:\n",
"    pickle.dump(lr_clf, model_file)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "f5757ab7",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"# Store the lower-cased feature names, in the order they appear in X, as JSON.\n",
"columns = {'data_columns': [col.lower() for col in X.columns]}\n",
"with open('columns.json', 'w') as f:\n",
"    json.dump(columns, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}