Spaces:
Sleeping
Sleeping
prakrati-m
commited on
Commit
•
15cf65d
1
Parent(s):
5ddb994
Upload 5 files
Browse files- .gitattributes +1 -0
- app.py +189 -0
- requirements.txt +8 -0
- test.csv +0 -0
- train.csv +3 -0
- veh.jpeg +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
train.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import seaborn as sns
|
5 |
+
from PIL import Image
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.linear_model import LinearRegression
|
9 |
+
from sklearn import metrics
|
10 |
+
|
11 |
+
|
12 |
+
st.sidebar.header("Dashboard")
|
13 |
+
st.sidebar.markdown("---")
|
14 |
+
app_mode = st.sidebar.selectbox('Select Page',['Introduction','Visualization','Prediction'])
|
15 |
+
|
16 |
+
df = pd.read_csv("train.csv")
|
17 |
+
|
18 |
+
|
19 |
+
if app_mode == "Introduction":
|
20 |
+
|
21 |
+
st.image("veh.jpeg", use_column_width=True)
|
22 |
+
st.title("Introduction")
|
23 |
+
st.markdown("#### What is the relation between Health and Vehicle insurance?")
|
24 |
+
|
25 |
+
st.markdown("##### Objectives")
|
26 |
+
st.markdown("We aim to see what factors contributes to the purchase of Vehicle insurance.")
|
27 |
+
|
28 |
+
st.markdown("Welcome to our Health Insurance Cross-Sell Dashboard!")
|
29 |
+
st.markdown("Did you know? Globally, approximately 55% of the population lacks access to essential health services.")
|
30 |
+
st.markdown("Furthermore, only 18% of the world's population has access to social security benefits that include health insurance coverage.")
|
31 |
+
st.markdown("Our dashboard aims to bridge this gap by providing personalized recommendations and insights to help individuals make informed decisions about their insurance needs.")
|
32 |
+
|
33 |
+
num = st.number_input('No. of Rows', 5, 10)
|
34 |
+
|
35 |
+
head = st.radio('View from top (head) or bottom (tail)', ('Head', 'Tail'))
|
36 |
+
if head == 'Head':
|
37 |
+
st.dataframe(df.head(num))
|
38 |
+
else:
|
39 |
+
st.dataframe(df.tail(num))
|
40 |
+
|
41 |
+
st.text('(Rows,Columns)')
|
42 |
+
st.write(df.shape)
|
43 |
+
|
44 |
+
st.markdown("##### Key Variables")
|
45 |
+
st.markdown("- Gender of the customer")
|
46 |
+
st.markdown("- Age of the customer")
|
47 |
+
st.markdown("- Does the customer possess a Driving License")
|
48 |
+
st.markdown("- Region of the customer")
|
49 |
+
st.markdown("- Does the customer possess a Health insurance")
|
50 |
+
st.markdown("- Age of the Vehicle")
|
51 |
+
st.markdown("- Did the customer damage vehicle in past")
|
52 |
+
st.markdown("- How much does customer pay for premium (INR)")
|
53 |
+
st.markdown("- How long has the customer been associated with the company")
|
54 |
+
|
55 |
+
st.markdown("From all these variables we aim to predict a price that the customers would be willing to pay for Vehicle Insurance.")  # fixed typo in user-facing text ("wim" -> "aim")
|
56 |
+
st.markdown("Analysing the relationships between such as 'Vehicle Damage' and 'Previously_insured' with 'Response' will help us define our target audience.")
|
57 |
+
st.markdown("Analysing relationships between 'Region' and 'Age' with 'Price' will help us define a price point.")
|
58 |
+
|
59 |
+
st.markdown("### Description of Data")
|
60 |
+
st.dataframe(df.describe())
|
61 |
+
st.markdown("Descriptions for all quantitative data **(rank and streams)** by:")
|
62 |
+
|
63 |
+
st.markdown("Count")
|
64 |
+
st.markdown("Mean")
|
65 |
+
st.markdown("Standard Deviation")
|
66 |
+
st.markdown("Minimum")
|
67 |
+
st.markdown("Quartiles")
|
68 |
+
st.markdown("Maximum")
|
69 |
+
|
70 |
+
st.markdown("### Missing Values")
|
71 |
+
st.markdown("Null or NaN values.")
|
72 |
+
|
73 |
+
dfnull = df.isnull().sum()/len(df)*100
|
74 |
+
totalmiss = round(df.isnull().sum().sum() / df.size * 100, 2)  # missing cells as a percentage of ALL cells; summing per-column percentages could exceed 100
|
75 |
+
st.write("Percentage of total missing values:",totalmiss)
|
76 |
+
st.write(dfnull)
|
77 |
+
if totalmiss <= 30:
|
78 |
+
st.success("We have no more than 30 percent of missing values, which is good. This provides us with more accurate data as the null values will not significantly affect the outcomes of our conclusions. And no bias will steer towards misleading results. ")  # wording matches the `<= 30` guard; fixed "less then"
|
79 |
+
else:
|
80 |
+
st.warning("Poor data quality due to greater than 30 percent of missing value.")
|
81 |
+
st.markdown(" > Theoretically, 25 to 30 percent is the maximum missing values are allowed, there's no hard and fast rule to decide this threshold. It can vary from problem to problem.")
|
82 |
+
|
83 |
+
st.markdown("### Completeness")
|
84 |
+
st.markdown(" The ratio of non-missing values to total records in dataset and how comprehensive the data is.")
|
85 |
+
|
86 |
+
st.write("Total data length:", len(df))
|
87 |
+
nonmissing = (df.notnull().sum().round(2))
|
88 |
+
completeness = round(nonmissing.sum() / df.size, 2)  # non-null cells over all cells (rows x columns), so the result is a true 0-1 ratio
|
89 |
+
|
90 |
+
st.write("Completeness ratio:",completeness)
|
91 |
+
st.write(nonmissing)
|
92 |
+
if completeness >= 0.80:
|
93 |
+
st.success("We have a completeness ratio of at least 0.80, which is good. It shows that the vast majority of the data is available for us to use and analyze. ")  # threshold in the text now matches the `>= 0.80` guard
|
94 |
+
else:
|
95 |
+
st.warning("Poor data quality due to low completeness ratio (less than 0.80).")  # poor-quality branch must not render as a success box; threshold matches the `>= 0.80` guard
|
96 |
+
|
97 |
+
elif app_mode == "Visualization":
|
98 |
+
st.title("Visualization")
|
99 |
+
|
100 |
+
# DATA TRIMMING
|
101 |
+
# Changing "Yes" and "No" to 1 and 0
|
102 |
+
df.loc[df['Vehicle_Damage'] == "Yes", 'Vehicle_Damage'] = 1
|
103 |
+
df.loc[df['Vehicle_Damage'] == "No", 'Vehicle_Damage'] = 0
|
104 |
+
|
105 |
+
# Deleting "Policy_Sales_Channel" column
|
106 |
+
del df['Policy_Sales_Channel']
|
107 |
+
|
108 |
+
# DATA VISUALISATION
|
109 |
+
|
110 |
+
tab1, tab2, tab3, tab4 = st.tabs(["SNS Plot", "Bar Chart", "Line Chart", "Pie Plot"])
|
111 |
+
|
112 |
+
#SNS plot
|
113 |
+
tab1.subheader("SNS plot")
|
114 |
+
sampled_df = df.sample(n=1000)
|
115 |
+
fig = sns.pairplot(sampled_df)
|
116 |
+
tab1.pyplot(fig)
|
117 |
+
|
118 |
+
#Bar Graph
|
119 |
+
# User input for x-variable
|
120 |
+
columns = ['Region_Code', 'Gender', 'Vehicle_Age']
|
121 |
+
x_variable = tab2.selectbox("Select x-variable:", columns)
|
122 |
+
tab2.subheader(f"{x_variable} vs Price (INR)")
|
123 |
+
data_by_variable = df.groupby(x_variable)['Annual_Premium'].mean()
|
124 |
+
tab2.bar_chart(data_by_variable)
|
125 |
+
|
126 |
+
#Line Graph
|
127 |
+
tab3.subheader("Age vs Price")
|
128 |
+
age_by_price = df.groupby('Age')['Annual_Premium'].mean()
|
129 |
+
tab3.line_chart(age_by_price)
|
130 |
+
|
131 |
+
#Pie Plot
|
132 |
+
tab4.subheader("Response distribution by Vehicle Damage")
|
133 |
+
response_counts = df.groupby(['Vehicle_Damage', 'Response']).size().unstack(fill_value=0)
|
134 |
+
fig, ax = plt.subplots()
|
135 |
+
colors = ['#ff9999','#66b3ff']
|
136 |
+
damage_counts = response_counts.loc[1]
|
137 |
+
percentages = (damage_counts.values / damage_counts.sum()) * 100
|
138 |
+
labels = ['Yes' if response == 1 else 'No' for response in damage_counts.index]  # derive labels from the Response values so each slice is named correctly (unstack orders columns 0, 1); assumes Response is coded 1 = yes
|
139 |
+
ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
|
140 |
+
ax.axis('equal')
|
141 |
+
tab4.pyplot(fig)
|
142 |
+
|
143 |
+
#Pie Plot2
|
144 |
+
tab4.subheader("Response Distribution by Not Previously Insured")
|
145 |
+
response_counts = df.groupby(['Previously_Insured', 'Response']).size().unstack(fill_value=0)
|
146 |
+
fig, ax = plt.subplots()
|
147 |
+
colors = ['#ff9999','#66b3ff']
|
148 |
+
prev_insurance_counts = response_counts.loc[0]
|
149 |
+
percentages = (prev_insurance_counts.values / prev_insurance_counts.sum()) * 100
|
150 |
+
labels = ['Yes' if response == 1 else 'No' for response in prev_insurance_counts.index]  # derive labels from the Response values so each slice is named correctly (unstack orders columns 0, 1); assumes Response is coded 1 = yes
|
151 |
+
ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
|
152 |
+
ax.axis('equal')
|
153 |
+
tab4.pyplot(fig)
|
154 |
+
|
155 |
+
|
156 |
+
|
157 |
+
elif app_mode == "Prediction":
|
158 |
+
# Changing "Yes" and "No" to 1 and 0
|
159 |
+
df.loc[df['Vehicle_Damage'] == "Yes", 'Vehicle_Damage'] = 1
|
160 |
+
df.loc[df['Vehicle_Damage'] == "No", 'Vehicle_Damage'] = 0
|
161 |
+
st.title("Prediction")
|
162 |
+
X = df[['Age', 'Region_Code', 'Driving_License','Vehicle_Damage', 'Previously_Insured']]
|
163 |
+
y = df['Annual_Premium']
|
164 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
165 |
+
lin_reg = LinearRegression()
|
166 |
+
lin_reg.fit(X_train,y_train)
|
167 |
+
pred = lin_reg.predict(X_test)
|
168 |
+
|
169 |
+
plt.figure(figsize=(10,7))
|
170 |
+
plt.title("Actual vs. predicted Annual Premiums",fontsize=25)
|
171 |
+
plt.xlabel("Actual test set Annual Premiums",fontsize=18)
|
172 |
+
plt.ylabel("Predicted Annual Premiums", fontsize=18)
|
173 |
+
plt.scatter(x=y_test,y=pred)
|
174 |
+
plt.savefig('prediction.png')
|
175 |
+
st.image('prediction.png')
|
176 |
+
|
177 |
+
# Model Evaluation
|
178 |
+
st.markdown("Evaluation")
|
179 |
+
coeff_df = pd.DataFrame(lin_reg.coef_, X.columns, columns=['Coefficient'])
|
180 |
+
st.dataframe(coeff_df)
|
181 |
+
MAE = metrics.mean_absolute_error(y_test, pred)
|
182 |
+
MSE = metrics.mean_squared_error(y_test, pred)
|
183 |
+
RMSE = np.sqrt(metrics.mean_squared_error(y_test, pred))
|
184 |
+
st.write('MAE:', MAE)
|
185 |
+
st.write('MSE:', MSE)
|
186 |
+
st.write('RMSE:', RMSE)
|
187 |
+
|
188 |
+
|
189 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
matplotlib
|
3 |
+
numpy
|
4 |
+
pandas
|
5 |
+
Pillow
|
6 |
+
sweetviz
|
7 |
+
scikit-learn
|
8 |
+
seaborn
|
test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b60d1072851756367817fb155e6e755ffa79509daa806720719bc395dfbea7d
|
3 |
+
size 21432357
|
veh.jpeg
ADDED