{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"url = 'https://raw.githubusercontent.com/nikhil-xyz/datasets/main/insurance.csv'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(url)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1338, 7)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" sex | \n",
" bmi | \n",
" children | \n",
" smoker | \n",
" region | \n",
" expenses | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19 | \n",
" female | \n",
" 27.9 | \n",
" 0 | \n",
" yes | \n",
" southwest | \n",
" 16884.92 | \n",
"
\n",
" \n",
" 1 | \n",
" 18 | \n",
" male | \n",
" 33.8 | \n",
" 1 | \n",
" no | \n",
" southeast | \n",
" 1725.55 | \n",
"
\n",
" \n",
" 2 | \n",
" 28 | \n",
" male | \n",
" 33.0 | \n",
" 3 | \n",
" no | \n",
" southeast | \n",
" 4449.46 | \n",
"
\n",
" \n",
" 3 | \n",
" 33 | \n",
" male | \n",
" 22.7 | \n",
" 0 | \n",
" no | \n",
" northwest | \n",
" 21984.47 | \n",
"
\n",
" \n",
" 4 | \n",
" 32 | \n",
" male | \n",
" 28.9 | \n",
" 0 | \n",
" no | \n",
" northwest | \n",
" 3866.86 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age sex bmi children smoker region expenses\n",
"0 19 female 27.9 0 yes southwest 16884.92\n",
"1 18 male 33.8 1 no southeast 1725.55\n",
"2 28 male 33.0 3 no southeast 4449.46\n",
"3 33 male 22.7 0 no northwest 21984.47\n",
"4 32 male 28.9 0 no northwest 3866.86"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 0\n",
"sex 0\n",
"bmi 0\n",
"children 0\n",
"smoker 0\n",
"region 0\n",
"expenses 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Rebind rather than mutate in place: drop_duplicates() returns a new frame\n",
"# and keeps the surviving rows' original index labels (no reset_index).\n",
"data = data.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Index: 1337 entries, 0 to 1337\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 age 1337 non-null int64 \n",
" 1 sex 1337 non-null object \n",
" 2 bmi 1337 non-null float64\n",
" 3 children 1337 non-null int64 \n",
" 4 smoker 1337 non-null object \n",
" 5 region 1337 non-null object \n",
" 6 expenses 1337 non-null float64\n",
"dtypes: float64(2), int64(2), object(3)\n",
"memory usage: 83.6+ KB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" bmi | \n",
" children | \n",
" expenses | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1337.000000 | \n",
" 1337.000000 | \n",
" 1337.000000 | \n",
" 1337.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 39.222139 | \n",
" 30.665520 | \n",
" 1.095737 | \n",
" 13279.121638 | \n",
"
\n",
" \n",
" std | \n",
" 14.044333 | \n",
" 6.100664 | \n",
" 1.205571 | \n",
" 12110.359657 | \n",
"
\n",
" \n",
" min | \n",
" 18.000000 | \n",
" 16.000000 | \n",
" 0.000000 | \n",
" 1121.870000 | \n",
"
\n",
" \n",
" 25% | \n",
" 27.000000 | \n",
" 26.300000 | \n",
" 0.000000 | \n",
" 4746.340000 | \n",
"
\n",
" \n",
" 50% | \n",
" 39.000000 | \n",
" 30.400000 | \n",
" 1.000000 | \n",
" 9386.160000 | \n",
"
\n",
" \n",
" 75% | \n",
" 51.000000 | \n",
" 34.700000 | \n",
" 2.000000 | \n",
" 16657.720000 | \n",
"
\n",
" \n",
" max | \n",
" 64.000000 | \n",
" 53.100000 | \n",
" 5.000000 | \n",
" 63770.430000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age bmi children expenses\n",
"count 1337.000000 1337.000000 1337.000000 1337.000000\n",
"mean 39.222139 30.665520 1.095737 13279.121638\n",
"std 14.044333 6.100664 1.205571 12110.359657\n",
"min 18.000000 16.000000 0.000000 1121.870000\n",
"25% 27.000000 26.300000 0.000000 4746.340000\n",
"50% 39.000000 30.400000 1.000000 9386.160000\n",
"75% 51.000000 34.700000 2.000000 16657.720000\n",
"max 64.000000 53.100000 5.000000 63770.430000"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- all candidates are between 18 and 64 years old\n",
"- at least 50% of candidates have one child or none"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Categories in the categorical columns**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categories in 'sex' variable: ['female' 'male']\n",
"Categories in 'smoker' variable: ['yes' 'no']\n",
"Categories in 'region' variable: ['southwest' 'southeast' 'northwest' 'northeast']\n"
]
}
],
"source": [
"# Show the distinct values of each categorical column.\n",
"# One loop replaces three copy-pasted print pairs; the printed text is unchanged.\n",
"for column in ('sex', 'smoker', 'region'):\n",
"    # end=\" \" keeps the array of unique values on the same line as its label.\n",
"    print(f\"Categories in '{column}' variable: \", end=\" \")\n",
"    print(data[column].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}