{ "cells": [ { "cell_type": "code", "execution_count": 89, "id": "2668805c", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "import matplotlib\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 3, "id": "2566dffa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
area_typeavailabilitylocationsizesocietytotal_sqftbathbalconyprice
0Super built-up Area19-DecElectronic City Phase II2 BHKCoomee10562.01.039.07
1Plot AreaReady To MoveChikka Tirupathi4 BedroomTheanmp26005.03.0120.00
2Built-up AreaReady To MoveUttarahalli3 BHKNaN14402.03.062.00
3Super built-up AreaReady To MoveLingadheeranahalli3 BHKSoiewre15213.01.095.00
4Super built-up AreaReady To MoveKothanur2 BHKNaN12002.01.051.00
\n", "
" ], "text/plain": [ " area_type availability location size \\\n", "0 Super built-up Area 19-Dec Electronic City Phase II 2 BHK \n", "1 Plot Area Ready To Move Chikka Tirupathi 4 Bedroom \n", "2 Built-up Area Ready To Move Uttarahalli 3 BHK \n", "3 Super built-up Area Ready To Move Lingadheeranahalli 3 BHK \n", "4 Super built-up Area Ready To Move Kothanur 2 BHK \n", "\n", " society total_sqft bath balcony price \n", "0 Coomee 1056 2.0 1.0 39.07 \n", "1 Theanmp 2600 5.0 3.0 120.00 \n", "2 NaN 1440 2.0 3.0 62.00 \n", "3 Soiewre 1521 3.0 1.0 95.00 \n", "4 NaN 1200 2.0 1.0 51.00 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1=pd.read_csv('bengaluru_house_prices.csv')\n", "df1.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "141f3c7b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13320, 9)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.shape" ] }, { "cell_type": "code", "execution_count": 5, "id": "a666c8e7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['area_type', 'availability', 'location', 'size', 'society',\n", " 'total_sqft', 'bath', 'balcony', 'price'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.columns" ] }, { "cell_type": "code", "execution_count": 6, "id": "c4b78e77", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Super built-up Area', 'Plot Area', 'Built-up Area',\n", " 'Carpet Area'], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1['area_type'].unique()" ] }, { "cell_type": "code", "execution_count": 7, "id": "d2b1286a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Super built-up Area 8790\n", "Built-up Area 2418\n", "Plot Area 2025\n", "Carpet Area 87\n", "Name: area_type, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1['area_type'].value_counts()" ] }, { "cell_type": "code", "execution_count": 8, "id": "4776cabc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathprice
0Electronic City Phase II2 BHK10562.039.07
1Chikka Tirupathi4 Bedroom26005.0120.00
2Uttarahalli3 BHK14402.062.00
3Lingadheeranahalli3 BHK15213.095.00
4Kothanur2 BHK12002.051.00
\n", "
" ], "text/plain": [ " location size total_sqft bath price\n", "0 Electronic City Phase II 2 BHK 1056 2.0 39.07\n", "1 Chikka Tirupathi 4 Bedroom 2600 5.0 120.00\n", "2 Uttarahalli 3 BHK 1440 2.0 62.00\n", "3 Lingadheeranahalli 3 BHK 1521 3.0 95.00\n", "4 Kothanur 2 BHK 1200 2.0 51.00" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2=df1.drop(['area_type','availability','society','balcony'],axis='columns')\n", "df2.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "b971be8c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13320, 5)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.shape" ] }, { "cell_type": "markdown", "id": "da19f381", "metadata": {}, "source": [ "### Data Cleaning: Handling NA values" ] }, { "cell_type": "code", "execution_count": 15, "id": "84ffb66b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location 1\n", "size 16\n", "total_sqft 0\n", "bath 73\n", "price 0\n", "dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 16, "id": "c7cc2a04", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "location 0\n", "size 0\n", "total_sqft 0\n", "bath 0\n", "price 0\n", "dtype: int64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3=df2.dropna()\n", "df3.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 17, "id": "1a0f5f5d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13246, 5)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3.shape" ] }, { "cell_type": "code", "execution_count": 19, "id": "618deabb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Dinesh_vivobook\\AppData\\Local\\Temp\\ipykernel_6088\\1261436634.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df3['bhk']=df3['size'].apply(lambda x: int(x.split(' ')[0]))\n" ] }, { "data": { "text/plain": [ "array([ 2, 4, 3, 6, 1, 8, 7, 5, 11, 9, 27, 10, 19, 16, 43, 14, 12,\n", " 13, 18], dtype=int64)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3['bhk']=df3['size'].apply(lambda x: int(x.split(' ')[0]))\n", "df3.bhk.unique()" ] }, { "cell_type": "code", "execution_count": 20, "id": "c027ea7e", "metadata": {}, "outputs": [], "source": [ "def is_float(x):\n", " try:\n", " float(x)\n", " except:\n", " return False\n", " return True" ] }, { "cell_type": "code", "execution_count": 22, "id": "506c5d8e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhk
30Yelahanka4 BHK2100 - 28504.0186.0004
122Hebbal4 BHK3067 - 81564.0477.0004
1378th Phase JP Nagar2 BHK1042 - 11052.054.0052
165Sarjapur2 BHK1145 - 13402.043.4902
188KR Puram2 BHK1015 - 15402.056.8002
410Kengeri1 BHK34.46Sq. Meter1.018.5001
549Hennur Road2 BHK1195 - 14402.063.7702
648Arekere9 Bedroom4125Perch9.0265.0009
661Yelahanka2 BHK1120 - 11452.048.1302
672Bettahalsoor4 Bedroom3090 - 50024.0445.0004
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk\n", "30 Yelahanka 4 BHK 2100 - 2850 4.0 186.000 4\n", "122 Hebbal 4 BHK 3067 - 8156 4.0 477.000 4\n", "137 8th Phase JP Nagar 2 BHK 1042 - 1105 2.0 54.005 2\n", "165 Sarjapur 2 BHK 1145 - 1340 2.0 43.490 2\n", "188 KR Puram 2 BHK 1015 - 1540 2.0 56.800 2\n", "410 Kengeri 1 BHK 34.46Sq. Meter 1.0 18.500 1\n", "549 Hennur Road 2 BHK 1195 - 1440 2.0 63.770 2\n", "648 Arekere 9 Bedroom 4125Perch 9.0 265.000 9\n", "661 Yelahanka 2 BHK 1120 - 1145 2.0 48.130 2\n", "672 Bettahalsoor 4 Bedroom 3090 - 5002 4.0 445.000 4" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3[~df3['total_sqft'].apply(is_float)].head(10)" ] }, { "cell_type": "code", "execution_count": 23, "id": "53cb2ebd", "metadata": {}, "outputs": [], "source": [ "def convert_sqft_to_num(x):\n", " tokens=x.split('-')\n", " if(len(tokens)==2):\n", " return (float(tokens[0])+float(tokens[1]))/2\n", " try:\n", " return float(x)\n", " except:\n", " return None" ] }, { "cell_type": "code", "execution_count": 24, "id": "990186c4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhk
0Electronic City Phase II2 BHK1056.02.039.072
1Chikka Tirupathi4 Bedroom2600.05.0120.004
2Uttarahalli3 BHK1440.02.062.003
3Lingadheeranahalli3 BHK1521.03.095.003
4Kothanur2 BHK1200.02.051.002
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk\n", "0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2\n", "1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4\n", "2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3\n", "3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3\n", "4 Kothanur 2 BHK 1200.0 2.0 51.00 2" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4=df3.copy()\n", "df4.total_sqft=df4.total_sqft.apply(convert_sqft_to_num)\n", "df4=df4[df4.total_sqft.notnull()]\n", "df4.head(5)" ] }, { "cell_type": "code", "execution_count": 25, "id": "7f1b0177", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
0Electronic City Phase II2 BHK1056.02.039.0723699.810606
1Chikka Tirupathi4 Bedroom2600.05.0120.0044615.384615
2Uttarahalli3 BHK1440.02.062.0034305.555556
3Lingadheeranahalli3 BHK1521.03.095.0036245.890861
4Kothanur2 BHK1200.02.051.0024250.000000
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk \\\n", "0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2 \n", "1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4 \n", "2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3 \n", "3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3 \n", "4 Kothanur 2 BHK 1200.0 2.0 51.00 2 \n", "\n", " price_per_sqft \n", "0 3699.810606 \n", "1 4615.384615 \n", "2 4305.555556 \n", "3 6245.890861 \n", "4 4250.000000 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5=df4.copy()\n", "df5['price_per_sqft']=df5['price']*100000/df5['total_sqft']\n", "df5.head()" ] }, { "cell_type": "code", "execution_count": 27, "id": "ad0a2319", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1.320000e+04\n", "mean 7.920759e+03\n", "std 1.067272e+05\n", "min 2.678298e+02\n", "25% 4.267701e+03\n", "50% 5.438331e+03\n", "75% 7.317073e+03\n", "max 1.200000e+07\n", "Name: price_per_sqft, dtype: float64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5_stats=df5['price_per_sqft'].describe()\n", "df5_stats" ] }, { "cell_type": "code", "execution_count": 29, "id": "fa41ad05", "metadata": {}, "outputs": [], "source": [ "df5.to_csv(\"bhp.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 31, "id": "17d613ae", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13200, 7)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.shape" ] }, { "cell_type": "code", "execution_count": 30, "id": "9cc20326", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Whitefield 533\n", "Sarjapur Road 392\n", "Electronic City 304\n", "Kanakpura Road 264\n", "Thanisandra 235\n", " ... \n", "Rajanna Layout 1\n", "Subramanyanagar 1\n", "Lakshmipura Vidyaanyapura 1\n", "Malur Hosur Road 1\n", "Abshot Layout 1\n", "Name: location, Length: 1287, dtype: int64" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.location = df5.location.apply(lambda x: x.strip())\n", "location_stats = df5['location'].value_counts(ascending=False)\n", "location_stats" ] }, { "cell_type": "code", "execution_count": 32, "id": "b0806611", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "13200" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "location_stats.values.sum()" ] }, { "cell_type": "code", "execution_count": 33, "id": "d9984a8b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "240" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(location_stats[location_stats>10])" ] }, { "cell_type": "code", "execution_count": 34, "id": "59cac967", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1287" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(location_stats)" ] }, { "cell_type": "code", "execution_count": 35, "id": "2fe2b271", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1047" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(location_stats[location_stats<=10])" ] }, { "cell_type": "code", "execution_count": 36, "id": "f3cddd30", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BTM 1st Stage 10\n", "Gunjur Palya 10\n", "Nagappa Reddy Layout 10\n", "Sector 1 HSR Layout 10\n", "Thyagaraja Nagar 10\n", " ..\n", "Rajanna Layout 1\n", "Subramanyanagar 1\n", "Lakshmipura Vidyaanyapura 1\n", "Malur Hosur Road 1\n", "Abshot Layout 1\n", "Name: location, Length: 1047, dtype: int64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "location_stats_less_than_10 = location_stats[location_stats<=10]\n", "location_stats_less_than_10" ] }, { "cell_type": "code", "execution_count": 37, "id": "6be6b2e6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1287" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df5.location.unique())" ] }, { "cell_type": "code", "execution_count": 38, "id": "5a9ea722", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "241" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)\n", "len(df5.location.unique())" ] }, { "cell_type": "code", "execution_count": 39, "id": "bccc5a57", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
0Electronic City Phase II2 BHK1056.02.039.0723699.810606
1Chikka Tirupathi4 Bedroom2600.05.0120.0044615.384615
2Uttarahalli3 BHK1440.02.062.0034305.555556
3Lingadheeranahalli3 BHK1521.03.095.0036245.890861
4Kothanur2 BHK1200.02.051.0024250.000000
5Whitefield2 BHK1170.02.038.0023247.863248
6Old Airport Road4 BHK2732.04.0204.0047467.057101
7Rajaji Nagar4 BHK3300.04.0600.00418181.818182
8Marathahalli3 BHK1310.03.063.2534828.244275
9other6 Bedroom1020.06.0370.00636274.509804
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk \\\n", "0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2 \n", "1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4 \n", "2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3 \n", "3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3 \n", "4 Kothanur 2 BHK 1200.0 2.0 51.00 2 \n", "5 Whitefield 2 BHK 1170.0 2.0 38.00 2 \n", "6 Old Airport Road 4 BHK 2732.0 4.0 204.00 4 \n", "7 Rajaji Nagar 4 BHK 3300.0 4.0 600.00 4 \n", "8 Marathahalli 3 BHK 1310.0 3.0 63.25 3 \n", "9 other 6 Bedroom 1020.0 6.0 370.00 6 \n", "\n", " price_per_sqft \n", "0 3699.810606 \n", "1 4615.384615 \n", "2 4305.555556 \n", "3 6245.890861 \n", "4 4250.000000 \n", "5 3247.863248 \n", "6 7467.057101 \n", "7 18181.818182 \n", "8 4828.244275 \n", "9 36274.509804 " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.head(10)" ] }, { "cell_type": "code", "execution_count": 40, "id": "8bc27536", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
9other6 Bedroom1020.06.0370.0636274.509804
45HSR Layout8 Bedroom600.09.0200.0833333.333333
58Murugeshpalya6 Bedroom1407.04.0150.0610660.980810
68Devarachikkanahalli8 Bedroom1350.07.085.086296.296296
70other3 Bedroom500.03.0100.0320000.000000
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk \\\n", "9 other 6 Bedroom 1020.0 6.0 370.0 6 \n", "45 HSR Layout 8 Bedroom 600.0 9.0 200.0 8 \n", "58 Murugeshpalya 6 Bedroom 1407.0 4.0 150.0 6 \n", "68 Devarachikkanahalli 8 Bedroom 1350.0 7.0 85.0 8 \n", "70 other 3 Bedroom 500.0 3.0 100.0 3 \n", "\n", " price_per_sqft \n", "9 36274.509804 \n", "45 33333.333333 \n", "58 10660.980810 \n", "68 6296.296296 \n", "70 20000.000000 " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5[df5.total_sqft/df5.bhk<300].head()" ] }, { "cell_type": "code", "execution_count": 41, "id": "091e3fda", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13200, 7)" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df5.shape" ] }, { "cell_type": "code", "execution_count": 42, "id": "458efc85", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(12456, 7)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df6 = df5[~(df5.total_sqft/df5.bhk<300)]\n", "df6.shape" ] }, { "cell_type": "code", "execution_count": 43, "id": "d3da79d6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 12456.000000\n", "mean 6308.502826\n", "std 4168.127339\n", "min 267.829813\n", "25% 4210.526316\n", "50% 5294.117647\n", "75% 6916.666667\n", "max 176470.588235\n", "Name: price_per_sqft, dtype: float64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df6.price_per_sqft.describe()" ] }, { "cell_type": "code", "execution_count": 47, "id": "fb2efbc6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10242, 7)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def remove_pps_outliers(df):\n", " df_out = pd.DataFrame()\n", " for key, subdf in df.groupby('location'):\n", " m = np.mean(subdf.price_per_sqft)\n", " st = np.std(subdf.price_per_sqft)\n", " reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]\n", " df_out = pd.concat([df_out,reduced_df],ignore_index=True)\n", " return df_out\n", "df7 = remove_pps_outliers(df6)\n", "df7.shape" ] }, { "cell_type": "code", "execution_count": 55, "id": "c5bba9cf", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "matplotlib.rcParams[\"figure.figsize\"] = (20,10)\n", "def plot_scatter_chart(df,location):\n", " bhk2 = df[(df.location==location) & (df.bhk==2)]\n", " bhk3 = df[(df.location==location) & (df.bhk==3)]\n", " matplotlib.rcParams['figure.figsize'] = (15,10)\n", " plt.scatter(bhk2.total_sqft,bhk2.price,color='blue',label='2 BHK', s=50)\n", " plt.scatter(bhk3.total_sqft,bhk3.price,marker='+', color='green',label='3 BHK', s=50)\n", " plt.xlabel(\"Total Square Feet Area\")\n", " plt.ylabel(\"Price (Lakh Indian Rupees)\")\n", " plt.title(location)\n", " plt.legend()\n", " \n", "plot_scatter_chart(df7,\"Rajaji Nagar\")" ] }, { "cell_type": "code", "execution_count": 56, "id": "013b9b86", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7317, 7)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def remove_bhk_outliers(df):\n", " exclude_indices = np.array([])\n", " for location, location_df in df.groupby('location'):\n", " bhk_stats = {}\n", " for bhk, bhk_df in location_df.groupby('bhk'):\n", " bhk_stats[bhk] = {\n", " 'mean': np.mean(bhk_df.price_per_sqft),\n", " 'std': np.std(bhk_df.price_per_sqft),\n", " 'count': bhk_df.shape[0]\n", " }\n", " for bhk, bhk_df in location_df.groupby('bhk'):\n", " stats = bhk_stats.get(bhk-1)\n", " if stats and stats['count']>5:\n", " exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)\n", " return df.drop(exclude_indices,axis='index')\n", "df8 = remove_bhk_outliers(df7)\n", "# df8 = df7.copy()\n", "df8.shape" ] }, { "cell_type": "code", "execution_count": 57, "id": "534f884f", "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plot_scatter_chart(df8,\"Rajaji Nagar\")" ] }, { "cell_type": "code", "execution_count": 58, "id": "ee8e044e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Count')" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib\n", "matplotlib.rcParams[\"figure.figsize\"] = (20,10)\n", "plt.hist(df8.price_per_sqft,rwidth=0.8)\n", "plt.xlabel(\"Price Per Square Feet\")\n", "plt.ylabel(\"Count\")" ] }, { "cell_type": "code", "execution_count": 59, "id": "a26184e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 4., 3., 2., 5., 8., 1., 6., 7., 9., 12., 16., 13.])" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df8.bath.unique()" ] }, { "cell_type": "code", "execution_count": 60, "id": "38bc530e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Count')" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.hist(df8.bath,rwidth=0.8)\n", "plt.xlabel(\"Number of bathrooms\")\n", "plt.ylabel(\"Count\")" ] }, { "cell_type": "code", "execution_count": 61, "id": "65fd495f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
5277Neeladri Nagar10 BHK4000.012.0160.0104000.000000
8483other10 BHK12000.012.0525.0104375.000000
8572other16 BHK10000.016.0550.0165500.000000
9306other11 BHK6000.012.0150.0112500.000000
9637other13 BHK5425.013.0275.0135069.124424
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk price_per_sqft\n", "5277 Neeladri Nagar 10 BHK 4000.0 12.0 160.0 10 4000.000000\n", "8483 other 10 BHK 12000.0 12.0 525.0 10 4375.000000\n", "8572 other 16 BHK 10000.0 16.0 550.0 16 5500.000000\n", "9306 other 11 BHK 6000.0 12.0 150.0 11 2500.000000\n", "9637 other 13 BHK 5425.0 13.0 275.0 13 5069.124424" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df8[df8.bath>10]" ] }, { "cell_type": "code", "execution_count": 62, "id": "07bd7517", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
1626Chikkabanavar4 Bedroom2460.07.080.043252.032520
5238Nagasandra4 Bedroom7000.08.0450.046428.571429
6711Thanisandra3 BHK1806.06.0116.036423.034330
8408other6 BHK11338.09.01000.068819.897689
\n", "
" ], "text/plain": [ " location size total_sqft bath price bhk price_per_sqft\n", "1626 Chikkabanavar 4 Bedroom 2460.0 7.0 80.0 4 3252.032520\n", "5238 Nagasandra 4 Bedroom 7000.0 8.0 450.0 4 6428.571429\n", "6711 Thanisandra 3 BHK 1806.0 6.0 116.0 3 6423.034330\n", "8408 other 6 BHK 11338.0 9.0 1000.0 6 8819.897689" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df8[df8.bath>df8.bhk+2]" ] }, { "cell_type": "code", "execution_count": 63, "id": "0b16203c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7239, 7)" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df9 = df8[df8.bath\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationsizetotal_sqftbathpricebhkprice_per_sqft
01st Block Jayanagar4 BHK2850.04.0428.0415017.543860
11st Block Jayanagar3 BHK1630.03.0194.0311901.840491
\n", "" ], "text/plain": [ " location size total_sqft bath price bhk price_per_sqft\n", "0 1st Block Jayanagar 4 BHK 2850.0 4.0 428.0 4 15017.543860\n", "1 1st Block Jayanagar 3 BHK 1630.0 3.0 194.0 3 11901.840491" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df9.head(2)" ] }, { "cell_type": "code", "execution_count": 65, "id": "d61cfae3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationtotal_sqftbathpricebhk
01st Block Jayanagar2850.04.0428.04
11st Block Jayanagar1630.03.0194.03
21st Block Jayanagar1875.02.0235.03
\n", "
" ], "text/plain": [ " location total_sqft bath price bhk\n", "0 1st Block Jayanagar 2850.0 4.0 428.0 4\n", "1 1st Block Jayanagar 1630.0 3.0 194.0 3\n", "2 1st Block Jayanagar 1875.0 2.0 235.0 3" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df10 = df9.drop(['size','price_per_sqft'],axis='columns')\n", "df10.head(3)" ] }, { "cell_type": "code", "execution_count": 66, "id": "7da53193", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout5th Phase JP Nagar6th Phase JP Nagar7th Phase JP Nagar8th Phase JP Nagar9th Phase JP Nagar...Vishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpurother
01000000000...0000000000
11000000000...0000000000
21000000000...0000000000
\n", "

3 rows × 241 columns

\n", "
" ], "text/plain": [ " 1st Block Jayanagar 1st Phase JP Nagar 2nd Phase Judicial Layout \\\n", "0 1 0 0 \n", "1 1 0 0 \n", "2 1 0 0 \n", "\n", " 2nd Stage Nagarbhavi 5th Block Hbr Layout 5th Phase JP Nagar \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", " 6th Phase JP Nagar 7th Phase JP Nagar 8th Phase JP Nagar \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", " 9th Phase JP Nagar ... Vishveshwarya Layout Vishwapriya Layout \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "2 0 ... 0 0 \n", "\n", " Vittasandra Whitefield Yelachenahalli Yelahanka Yelahanka New Town \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "\n", " Yelenahalli Yeshwanthpur other \n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", "[3 rows x 241 columns]" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dummies = pd.get_dummies(df10.location)\n", "dummies.head(3)" ] }, { "cell_type": "code", "execution_count": 67, "id": "f6b6d2bf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationtotal_sqftbathpricebhk1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout...VijayanagarVishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpur
01st Block Jayanagar2850.04.0428.0410000...0000000000
11st Block Jayanagar1630.03.0194.0310000...0000000000
21st Block Jayanagar1875.02.0235.0310000...0000000000
31st Block Jayanagar1200.02.0130.0310000...0000000000
41st Block Jayanagar1235.02.0148.0210000...0000000000
\n", "

5 rows × 245 columns

\n", "
" ], "text/plain": [ " location total_sqft bath price bhk 1st Block Jayanagar \\\n", "0 1st Block Jayanagar 2850.0 4.0 428.0 4 1 \n", "1 1st Block Jayanagar 1630.0 3.0 194.0 3 1 \n", "2 1st Block Jayanagar 1875.0 2.0 235.0 3 1 \n", "3 1st Block Jayanagar 1200.0 2.0 130.0 3 1 \n", "4 1st Block Jayanagar 1235.0 2.0 148.0 2 1 \n", "\n", " 1st Phase JP Nagar 2nd Phase Judicial Layout 2nd Stage Nagarbhavi \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " 5th Block Hbr Layout ... Vijayanagar Vishveshwarya Layout \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "2 0 ... 0 0 \n", "3 0 ... 0 0 \n", "4 0 ... 0 0 \n", "\n", " Vishwapriya Layout Vittasandra Whitefield Yelachenahalli Yelahanka \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 \n", "\n", " Yelahanka New Town Yelenahalli Yeshwanthpur \n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", "[5 rows x 245 columns]" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df11 = pd.concat([df10,dummies.drop('other',axis='columns')],axis='columns')\n", "df11.head()" ] }, { "cell_type": "code", "execution_count": 68, "id": "b57187c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_sqftbathpricebhk1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout5th Phase JP Nagar...VijayanagarVishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpur
02850.04.0428.04100000...0000000000
11630.03.0194.03100000...0000000000
\n", "

2 rows × 244 columns

\n", "
" ], "text/plain": [ " total_sqft bath price bhk 1st Block Jayanagar 1st Phase JP Nagar \\\n", "0 2850.0 4.0 428.0 4 1 0 \n", "1 1630.0 3.0 194.0 3 1 0 \n", "\n", " 2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr Layout \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "\n", " 5th Phase JP Nagar ... Vijayanagar Vishveshwarya Layout \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "\n", " Vishwapriya Layout Vittasandra Whitefield Yelachenahalli Yelahanka \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "\n", " Yelahanka New Town Yelenahalli Yeshwanthpur \n", "0 0 0 0 \n", "1 0 0 0 \n", "\n", "[2 rows x 244 columns]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12 = df11.drop('location',axis='columns')\n", "df12.head(2)" ] }, { "cell_type": "code", "execution_count": 69, "id": "954b49f6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7239, 244)" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12.shape" ] }, { "cell_type": "code", "execution_count": 70, "id": "e4835adf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_sqftbathbhk1st Block Jayanagar1st Phase JP Nagar2nd Phase Judicial Layout2nd Stage Nagarbhavi5th Block Hbr Layout5th Phase JP Nagar6th Phase JP Nagar...VijayanagarVishveshwarya LayoutVishwapriya LayoutVittasandraWhitefieldYelachenahalliYelahankaYelahanka New TownYelenahalliYeshwanthpur
02850.04.041000000...0000000000
11630.03.031000000...0000000000
21875.02.031000000...0000000000
\n", "

3 rows × 243 columns

\n", "
" ], "text/plain": [ " total_sqft bath bhk 1st Block Jayanagar 1st Phase JP Nagar \\\n", "0 2850.0 4.0 4 1 0 \n", "1 1630.0 3.0 3 1 0 \n", "2 1875.0 2.0 3 1 0 \n", "\n", " 2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr Layout \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "\n", " 5th Phase JP Nagar 6th Phase JP Nagar ... Vijayanagar \\\n", "0 0 0 ... 0 \n", "1 0 0 ... 0 \n", "2 0 0 ... 0 \n", "\n", " Vishveshwarya Layout Vishwapriya Layout Vittasandra Whitefield \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "\n", " Yelachenahalli Yelahanka Yelahanka New Town Yelenahalli Yeshwanthpur \n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "\n", "[3 rows x 243 columns]" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = df12.drop(['price'],axis='columns')\n", "X.head(3)" ] }, { "cell_type": "code", "execution_count": 71, "id": "f803bc44", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7239, 243)" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape" ] }, { "cell_type": "code", "execution_count": 72, "id": "3f722430", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 428.0\n", "1 194.0\n", "2 235.0\n", "Name: price, dtype: float64" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y = df12.price\n", "y.head(3)" ] }, { "cell_type": "code", "execution_count": 73, "id": "3d6fddb3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7239" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(y)" ] }, { "cell_type": "code", "execution_count": 82, "id": "26df7c7c", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=13)" ] }, { "cell_type": "code", "execution_count": 83, "id": "ca73765a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8588660477968386" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "lr_clf = LinearRegression()\n", "lr_clf.fit(X_train,y_train)\n", "lr_clf.score(X_test,y_test)" ] }, { "cell_type": "code", "execution_count": 86, "id": "ee5f5b93", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.82702546, 0.86027005, 0.85322178, 0.8436466 , 0.85481502])" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import ShuffleSplit\n", "from sklearn.model_selection import cross_val_score\n", "cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)\n", "cross_val_score(LinearRegression(),X,y,cv=cv)" ] }, { "cell_type": "code", "execution_count": 90, "id": "41bd8610", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelbest_scorebest_params
0linear_regression0.847796{'normalize': False}
1lasso0.726861{'alpha': 2, 'selection': 'random'}
2decision_tree0.718810{'criterion': 'mse', 'splitter': 'random'}
\n", "
" ], "text/plain": [ " model best_score best_params\n", "0 linear_regression 0.847796 {'normalize': False}\n", "1 lasso 0.726861 {'alpha': 2, 'selection': 'random'}\n", "2 decision_tree 0.718810 {'criterion': 'mse', 'splitter': 'random'}" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import GridSearchCV\n", "from sklearn.linear_model import Lasso\n", "from sklearn.tree import DecisionTreeRegressor\n", "def find_best_model_using_gridsearchcv(X,y):\n", " algos = {\n", " 'linear_regression' : {\n", " 'model': LinearRegression(),\n", " 'params': {\n", " 'normalize': [True, False]\n", " }\n", " },\n", " 'lasso': {\n", " 'model': Lasso(),\n", " 'params': {\n", " 'alpha': [1,2],\n", " 'selection': ['random', 'cyclic']\n", " }\n", " },\n", " 'decision_tree': {\n", " 'model': DecisionTreeRegressor(),\n", " 'params': {\n", " 'criterion' : ['mse','friedman_mse'],\n", " 'splitter': ['best','random']\n", " }\n", " }\n", " }\n", " scores = []\n", " cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)\n", " for algo_name, config in algos.items():\n", " gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)\n", " gs.fit(X,y)\n", " scores.append({\n", " 'model': algo_name,\n", " 'best_score': gs.best_score_,\n", " 'best_params': gs.best_params_\n", " })\n", "\n", " return pd.DataFrame(scores,columns=['model','best_score','best_params'])\n", "\n", "find_best_model_using_gridsearchcv(X,y)\n" ] }, { "cell_type": "code", "execution_count": 91, "id": "1e2267ce", "metadata": {}, "outputs": [], "source": [ "def predict_price(location,sqft,bath,bhk): \n", " loc_index = np.where(X.columns==location)[0][0]\n", "\n", " x = np.zeros(len(X.columns))\n", " x[0] = sqft\n", " x[1] = bath\n", " x[2] = bhk\n", " if loc_index >= 0:\n", " x[loc_index] = 1\n", "\n", " return lr_clf.predict([x])[0]" ] }, { "cell_type": "code", "execution_count": 92, "id": "92132555", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "90.56832868368073" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_price('1st Phase JP Nagar',1000, 2, 2)" ] }, { "cell_type": "code", "execution_count": 93, "id": "93e6427d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "93.52384203792523" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_price('1st Phase JP Nagar',1000, 3, 3)" ] }, { "cell_type": "code", "execution_count": 94, "id": "d2c3bee7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "184.177406050691" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_price('Indira Nagar',1000, 3, 3)" ] }, { "cell_type": "code", "execution_count": 95, "id": "afae9289", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "181.2218926964465" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_price('Indira Nagar',1000, 2, 2)" ] }, { "cell_type": "code", "execution_count": 97, "id": "c3eee16b", "metadata": {}, "outputs": [], "source": [ "import pickle\n", "with open('banglore_home_prices_model.pickle','wb') as f:\n", " pickle.dump(lr_clf,f)" ] }, { "cell_type": "code", "execution_count": 99, "id": "f5757ab7", "metadata": {}, "outputs": [], "source": [ "import json\n", "columns={\n", " 'data_columns':[col.lower() for col in X.columns]\n", "}\n", "with open('columns.json','w') as f:\n", " f.write(json.dumps(columns))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }