prakrati-m committed on
Commit
15cf65d
1 Parent(s): 5ddb994

Upload 5 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. app.py +189 -0
  3. requirements.txt +8 -0
  4. test.csv +0 -0
  5. train.csv +3 -0
  6. veh.jpeg +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ train.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics


# Sidebar navigation: the selected page drives which branch of the script renders.
st.sidebar.header("Dashboard")
st.sidebar.markdown("---")
app_mode = st.sidebar.selectbox('Select Page', ['Introduction', 'Visualization', 'Prediction'])


@st.cache_data
def _load_data() -> pd.DataFrame:
    """Read the training data once and cache it across Streamlit reruns.

    train.csv is ~21 MB (tracked via git-lfs); without caching it is re-parsed
    on every widget interaction. ``st.cache_data`` hands each rerun a fresh
    copy, so the in-place mutations the pages below apply to ``df`` (column
    deletion, Yes/No recoding) never leak into the cached original.
    """
    return pd.read_csv("train.csv")


df = _load_data()
if app_mode == "Introduction":
    # Landing page: project description, dataset preview, and data-quality checks.
    st.image("veh.jpeg", use_column_width=True)
    st.title("Introduction")
    st.markdown("#### What is the relation between Health and Vehicle insurance?")

    st.markdown("##### Objectives")
    # FIX: grammar — "factors contributes" -> "factors contribute".
    st.markdown("We aim to see what factors contribute to the purchase of Vehicle insurance.")

    st.markdown("Welcome to our Health Insurance Cross-Sell Dashboard!")
    st.markdown("Did you know? Globally, approximately 55% of the population lacks access to essential health services.")
    st.markdown("Furthermore, only 18% of the world's population has access to social security benefits that include health insurance coverage.")
    st.markdown("Our dashboard aims to bridge this gap by providing personalized recommendations and insights to help individuals make informed decisions about their insurance needs.")

    # User-controlled preview: 5-10 rows, from the head or the tail of the data.
    num = st.number_input('No. of Rows', 5, 10)

    head = st.radio('View from top (head) or bottom (tail)', ('Head', 'Tail'))
    if head == 'Head':
        st.dataframe(df.head(num))
    else:
        st.dataframe(df.tail(num))

    st.text('(Rows,Columns)')
    st.write(df.shape)

    st.markdown("##### Key Variables")
    st.markdown("- Gender of the customer")
    st.markdown("- Age of the customer")
    st.markdown("- Does the customer possess a Driving License")
    st.markdown("- Region of the customer")
    st.markdown("- Does the customer possess a Health insurance")
    st.markdown("- Age of the Vehicle")
    st.markdown("- Did the customer damage vehicle in past")
    st.markdown("- How much does customer pay for premium (INR)")
    st.markdown("- How long has the customer been associated with the company")

    # FIX: typo "wim" -> "aim".
    st.markdown("From all these variables we aim to predict a price that the customers would be willing to pay for Vehicle Insurance.")
    st.markdown("Analysing the relationships between such as 'Vehicle Damage' and 'Previously_insured' with 'Response' will help us define our target audience.")
    st.markdown("Analysing relationships between 'Region' and 'Age' with 'Price' will help us define a price point.")

    st.markdown("### Description of Data")
    st.dataframe(df.describe())
    st.markdown("Descriptions for all quantitative data **(rank and streams)** by:")

    st.markdown("Count")
    st.markdown("Mean")
    st.markdown("Standard Deviation")
    st.markdown("Minimum")
    st.markdown("Quartiles")
    st.markdown("Maximum")

    st.markdown("### Missing Values")
    st.markdown("Null or NaN values.")

    # Per-column missing-value percentage, and the total across all columns.
    dfnull = df.isnull().sum()/len(df)*100
    totalmiss = dfnull.sum().round(2)
    st.write("Percentage of total missing values:", totalmiss)
    st.write(dfnull)
    if totalmiss <= 30:
        # FIX: typo "less then" -> "less than".
        st.success("We have less than 30 percent of missing values, which is good. This provides us with more accurate data as the null values will not significantly affect the outcomes of our conclusions. And no bias will steer towards misleading results. ")
    else:
        st.warning("Poor data quality due to greater than 30 percent of missing value.")
    st.markdown(" > Theoretically, 25 to 30 percent is the maximum missing values are allowed, there's no hard and fast rule to decide this threshold. It can vary from problem to problem.")

    st.markdown("### Completeness")
    st.markdown(" The ratio of non-missing values to total records in dataset and how comprehensive the data is.")

    st.write("Total data length:", len(df))
    nonmissing = (df.notnull().sum().round(2))
    completeness = round(sum(nonmissing)/len(df), 2)

    st.write("Completeness ratio:", completeness)
    st.write(nonmissing)
    # FIX: messages said 0.85 while the code tests 0.80 — aligned to the
    # actual threshold; and the failure branch wrongly used st.success.
    if completeness >= 0.80:
        st.success("We have completeness ratio greater than 0.80, which is good. It shows that the vast majority of the data is available for us to use and analyze. ")
    else:
        st.error("Poor data quality due to low completeness ratio (less than 0.80).")
+ elif app_mode == "Visualization":
98
+ st.title("Visualization")
99
+
100
+ # DATA TRIMMING
101
+ # Changing "Yes" and "No" to 1 and 0
102
+ df.loc[df['Vehicle_Damage'] == "Yes", 'Vehicle_Damage'] = 1
103
+ df.loc[df['Vehicle_Damage'] == "No", 'Vehicle_Damage'] = 0
104
+
105
+ # Deleting "Policy_Sales_Channel" column
106
+ del df['Policy_Sales_Channel']
107
+
108
+ # DATA VISUALISATION
109
+
110
+ tab1, tab2, tab3, tab4 = st.tabs(["SNS Plot", "Bar Chart", "Line Chart", "Pie Plot"])
111
+
112
+ #SNS plot
113
+ tab1.subheader("SNS plot")
114
+ sampled_df = df.sample(n=1000)
115
+ fig = sns.pairplot(sampled_df)
116
+ tab1.pyplot(fig)
117
+
118
+ #Bar Graph
119
+ # User input for x-variable
120
+ columns = ['Region_Code', 'Gender', 'Vehicle_Age']
121
+ x_variable = tab2.selectbox("Select x-variable:", columns)
122
+ tab2.subheader(f"{x_variable} vs Price (INR)")
123
+ data_by_variable = df.groupby(x_variable)['Annual_Premium'].mean()
124
+ tab2.bar_chart(data_by_variable)
125
+
126
+ #Line Graph
127
+ tab3.subheader("Age vs Price")
128
+ age_by_price = df.groupby('Age')['Annual_Premium'].mean()
129
+ tab3.line_chart(age_by_price)
130
+
131
+ #Pie Plot
132
+ tab4.subheader("Response distribution by Vehicle Damage")
133
+ response_counts = df.groupby(['Vehicle_Damage', 'Response']).size().unstack(fill_value=0)
134
+ fig, ax = plt.subplots()
135
+ colors = ['#ff9999','#66b3ff']
136
+ damage_counts = response_counts.loc[1]
137
+ percentages = (damage_counts.values / damage_counts.sum()) * 100
138
+ labels = ['Yes', 'No']
139
+ ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
140
+ ax.axis('equal')
141
+ tab4.pyplot(fig)
142
+
143
+ #Pie Plot2
144
+ tab4.subheader("Response Distribution by Not Previously Insured")
145
+ response_counts = df.groupby(['Previously_Insured', 'Response']).size().unstack(fill_value=0)
146
+ fig, ax = plt.subplots()
147
+ colors = ['#ff9999','#66b3ff']
148
+ prev_insurance_counts = response_counts.loc[0]
149
+ percentages = (prev_insurance_counts.values / prev_insurance_counts.sum()) * 100
150
+ labels = ['Yes', 'No']
151
+ ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
152
+ ax.axis('equal')
153
+ tab4.pyplot(fig)
154
+
155
+
156
+
157
+ elif app_mode == "Prediction":
158
+ # Changing "Yes" and "No" to 1 and 0
159
+ df.loc[df['Vehicle_Damage'] == "Yes", 'Vehicle_Damage'] = 1
160
+ df.loc[df['Vehicle_Damage'] == "No", 'Vehicle_Damage'] = 0
161
+ st.title("Prediction")
162
+ X = df[['Age', 'Region_Code', 'Driving_License','Vehicle_Damage', 'Previously_Insured']]
163
+ y = df['Annual_Premium']
164
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
165
+ lin_reg = LinearRegression()
166
+ lin_reg.fit(X_train,y_train)
167
+ pred = lin_reg.predict(X_test)
168
+
169
+ plt.figure(figsize=(10,7))
170
+ plt.title("Actual vs. predicted Annual Premiums",fontsize=25)
171
+ plt.xlabel("Actual test set Annual Premiums",fontsize=18)
172
+ plt.ylabel("Predicted Annual Premiums", fontsize=18)
173
+ plt.scatter(x=y_test,y=pred)
174
+ plt.savefig('prediction.png')
175
+ st.image('prediction.png')
176
+
177
+ # Model Evaluation
178
+ st.markdown("Evaluation")
179
+ coeff_df = pd.DataFrame(lin_reg.coef_, X.columns, columns=['Coefficient'])
180
+ st.dataframe(coeff_df)
181
+ MAE = metrics.mean_absolute_error(y_test, pred)
182
+ MSE = metrics.mean_squared_error(y_test, pred)
183
+ RMSE = np.sqrt(metrics.mean_squared_error(y_test, pred))
184
+ st.write('MAE:', MAE)
185
+ st.write('MSE:', MSE)
186
+ st.write('RMSE:', RMSE)
187
+
188
+
189
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ matplotlib
3
+ numpy
4
+ pandas
5
+ Pillow
6
+ sweetviz
7
+ scikit-learn
8
+ seaborn
test.csv ADDED
The diff for this file is too large to render. See raw diff
 
train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b60d1072851756367817fb155e6e755ffa79509daa806720719bc395dfbea7d
3
+ size 21432357
veh.jpeg ADDED