ibrahimnomad committed on
Commit
541ab20
1 Parent(s): 133e276

Upload 3 files

Browse files
Files changed (3) hide show
  1. Life Expectancy Data.csv +0 -0
  2. app.py +234 -0
  3. requirements.txt +6 -0
Life Expectancy Data.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import plotly.graph_objects as go
4
+ import numpy as np
5
+ import seaborn as sns
6
+ from scipy.stats.mstats import winsorize
7
+ import streamlit as st
8
# Load the raw dataset; the path is relative to the working directory.
raw_csv_path = "Life Expectancy Data.csv"
df = pd.read_csv(raw_csv_path)

st.title('Analyzing The World :earth_africa:')
st.write('**Below data is edited for better analysis and has 2900 rows.It gives life expectancy info for every country between the years 2000-2015. We will get so see the development or regession for each coutry and the world average.**')

# Map the raw CSV headers (several carry stray leading/trailing spaces) to
# identifier-like names used by the rest of the script.
column_renames = {
    " BMI ": "BMI",
    "Life expectancy ": "Life_expectancy",
    "Adult Mortality": "Adult_mortality",
    "infant deaths": "Infant_deaths",
    "percentage expenditure": "Percentage_expenditure",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    "under-five deaths ": "Under_five_deaths",
    "Total expenditure": "Total_expenditure",
    "Diphtheria ": "Diphtheria",
    " thinness 1-19 years": "Thinness_1-19_years",
    " thinness 5-9 years": "Thinness_5-9_years",
    " HIV/AIDS": "HIV/AIDS",
    "Income composition of resources": "Income_composition_of_resources",
}
df = df.rename(columns=column_renames)
26
+
27
# --- Missing-value imputation -------------------------------------------
# Step 1: linearly interpolate gaps within each country's own rows.
# The result must be assigned back: interpolate() returns a new object, so
# calling it without assignment leaves df unchanged.
# NOTE(review): interpolation is positional, so this assumes each country's
# rows are already ordered by year in the CSV -- confirm against the data.
interpolated_cols = [
    name for name in df.select_dtypes(include="number").columns if name != "Year"
]
df[interpolated_cols] = df.groupby("Country")[interpolated_cols].transform(
    lambda series: series.interpolate(method="linear")
)

# Step 2: fill what is still missing with the median of the same year.
# Columns are selected by position (everything after the fourth column),
# which leaves Life_expectancy to the mean fill in step 3.
median_fill_cols = df.columns[4:]
imputed_data = []
# sort=False keeps the years in their order of first appearance, so the
# concatenated frame has the same row order as a loop over Year.unique().
for _, year_data in df.groupby("Year", sort=False):
    year_data = year_data.copy()
    year_data[median_fill_cols] = year_data[median_fill_cols].fillna(
        year_data[median_fill_cols].median()
    )
    imputed_data.append(year_data)

# ignore_index=True yields a clean 0..n-1 index without a leftover
# 'index' column.
df = pd.concat(imputed_data, ignore_index=True)

# Step 3: remaining gaps in the target column get the overall mean.
# Assign the result instead of using a chained inplace fillna, which may
# operate on a temporary copy and leave df untouched.
df['Life_expectancy'] = df['Life_expectancy'].fillna(df['Life_expectancy'].mean())
38
+
39
st.dataframe(df)

st.write('**For a better Analysis, we should also remove outliers. Lets see them first.**')

# Columns to inspect, in the order they are laid out on the 5x4 grid.
boxplot_columns = [
    'Life_expectancy', 'Adult_mortality', 'Infant_deaths', 'Alcohol',
    'Percentage_expenditure', 'HepatitisB', 'Measles', 'BMI',
    'Under_five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria',
    'HIV/AIDS', 'GDP', 'Population', 'Thinness_1-19_years',
    'Thinness_5-9_years', 'Income_composition_of_resources', 'Schooling',
]
# Column name -> 1-based subplot position.
col_dict = {name: slot for slot, name in enumerate(boxplot_columns, start=1)}

fig = plt.figure(figsize=(20, 30))
for variable, slot in col_dict.items():
    ax = fig.add_subplot(5, 4, slot)
    ax.boxplot(df[variable])
    ax.set_title(variable)
    ax.grid(True)
st.pyplot(fig)
50
+
51
st.write("""
We'll remove outliers in Infant_Deaths, Measles, and Under_five_deaths columns since values beyond 1000 are unrealistic.

Similarly, we'll address extreme values in Expenditure_Percentage, GDP, and Population columns by taking logarithmic values.

BMI values above 40 indicate extreme obesity, and some countries have averages around 60, which is not possible. Therefore, we'll remove the entire BMI column.

For other columns with outliers, we'll apply winsorization for data normalization.
""")

# Remove outliers and log transform
# Keep only rows where all three count columns are at most 1000, and drop
# BMI. Chaining .drop() produces a fresh frame, so the assignments below do
# not write into a slice of the previous df.
count_cols = ['Infant_deaths', 'Measles', 'Under_five_deaths']
within_range = df[count_cols].lt(1001).all(axis=1)
df = df[within_range].drop(columns='BMI')

# The transformed values must be assigned back; calling apply(np.log)
# without assignment would leave the columns untouched.
log_cols = ['Percentage_expenditure', 'Population', 'GDP']
with np.errstate(divide='ignore'):
    # log(0) is -inf; the warning is silenced because those values are
    # replaced by 0 on the next line.
    df[log_cols] = np.log(df[log_cols])
df = df.replace([np.inf, -np.inf], 0)
66
+
67
# Winsorization
cols_to_winsorize = [
    'Life_expectancy', 'Adult_mortality', 'Alcohol', 'HepatitisB', 'Polio',
    'Total_expenditure', 'Diphtheria', 'HIV/AIDS', 'Thinness_1-19_years',
    'Thinness_5-9_years', 'Income_composition_of_resources', 'Schooling',
]
winz_cols = list(cols_to_winsorize)

# Per-column (lower, upper) fractions handed to scipy's winsorize.
winsor_limits = {
    'Life_expectancy': (0.05, 0),
    'Adult_mortality': (0, 0.04),
    'Alcohol': (0.0, 0.01),
    'HepatitisB': (0.20, 0.0),
    'Polio': (0.20, 0.0),
    'Total_expenditure': (0.0, 0.02),
    'Diphtheria': (0.11, 0.0),
    'HIV/AIDS': (0.0, 0.21),
    'Thinness_1-19_years': (0.0, 0.04),
    'Thinness_5-9_years': (0.0, 0.04),
    'Income_composition_of_resources': (0.05, 0.0),
}
# Any column without an explicit entry (here: Schooling) uses this fallback.
fallback_limits = (0.03, 0.01)


def _winsorize_column(series):
    """Winsorize one column with the limits registered for its name."""
    return winsorize(series, limits=winsor_limits.get(series.name, fallback_limits))


df[winz_cols] = df[cols_to_winsorize].apply(_winsorize_column, axis=0)
85
+
86
# Plot boxplots for winsorized variables
# The six extra columns were handled by filtering/log transform rather than
# winsorization; they are shown alongside for comparison.
extra_columns = ['Measles', 'Infant_deaths', 'Under_five_deaths', 'GDP', 'Population', 'Percentage_expenditure']
cols_to_plot = winz_cols + extra_columns

fig, axs = plt.subplots(3, 6, figsize=(20, 20))
for panel, column_name in zip(axs.ravel(), cols_to_plot):
    sns.boxplot(y=df[column_name], ax=panel, color="green")
    panel.set_title(column_name)
    panel.set_ylabel('')
    panel.grid(True)

plt.tight_layout()
st.pyplot(fig)
97
+
98
st.write('**Analysis**')

# One histogram per plotted column, placed on a 6x6 subplot grid.
fig = plt.figure(figsize=(20, 20))
for slot, variable in enumerate(cols_to_plot, start=1):
    hist_ax = fig.add_subplot(6, 6, slot)
    hist_ax.hist(df[variable])
    hist_ax.set_title(variable)
    hist_ax.set_ylabel('')
    hist_ax.grid(True)
st.pyplot(fig)
108
+
109
# Plot correlation heatmap
life_exp = cols_to_plot + ['Year']

# Compute the correlation table once; the raw array is reused further down.
correlation_table = df[life_exp].corr()
corr_matrix = correlation_table.values

plt.figure(figsize=(15, 10))
heatmap_axes = sns.heatmap(correlation_table, annot=True, linewidths=4)
st.pyplot(heatmap_axes.figure)
114
+
115
+
116
+
117
# Get correlations
def _rank_correlation_pairs(matrix, labels, limit, positive):
    """Return the `limit` strongest positive or negative column pairs.

    Only the strict upper triangle of `matrix` is read, so each unordered
    pair appears exactly once and the diagonal is never included. This
    makes value-based de-duplication unnecessary, which would otherwise
    discard distinct pairs that happen to share the same correlation.

    Args:
        matrix: square 2-D array of correlation coefficients.
        labels: column labels, in the same order as the matrix axes.
        limit: maximum number of pairs to return.
        positive: True for the largest positive values, False for the
            most negative ones.

    Returns:
        DataFrame with columns '1', '2' and 'Correlation', strongest pair
        first. It may hold fewer than `limit` rows when there are not
        enough pairs of the requested sign.
    """
    upper_rows, upper_cols = np.triu_indices_from(matrix, k=1)
    label_array = np.asarray(labels)
    pairs = pd.DataFrame({
        '1': label_array[upper_rows],
        '2': label_array[upper_cols],
        'Correlation': matrix[upper_rows, upper_cols],
    })
    # NaN coefficients carry no ranking information, so drop them.
    pairs = pairs.dropna(subset=['Correlation'])
    if positive:
        pairs = pairs[pairs['Correlation'] > 0]
    else:
        pairs = pairs[pairs['Correlation'] < 0]
    ranked = pairs.sort_values('Correlation', ascending=not positive, kind='stable')
    return ranked.head(limit).reset_index(drop=True)


# Create DataFrames for positive and negative correlations
corr_columns = df[life_exp].columns
corr_df = _rank_correlation_pairs(corr_matrix, corr_columns, 25, positive=True)
neg_corr_df = _rank_correlation_pairs(corr_matrix, corr_columns, 25, positive=False)

# Display the top correlations
st.write("Top 25 Positive Correlations:")
st.dataframe(corr_df)

st.write("Top 25 Negative Correlations:")
st.dataframe(neg_corr_df)

st.write("""
Key insights from the correlation analysis:

- Adult mortality exhibits a negative correlation with schooling and income composition, while it positively correlates with HIV/AIDS.
- Infant deaths and under-five deaths are strongly positively correlated.
- Schooling and alcohol consumption display a positive relationship.
- Percentage expenditure shows positive correlations with schooling, income composition, GDP, and life expectancy.
- Hepatitis B is strongly positively correlated with polio and diphtheria.
- Polio and diphtheria show strong positive correlations with each other and with life expectancy.
- Life expectancy is positively correlated with schooling, income composition, GDP, diphtheria, polio, and percentage expenditure. Conversely, it is negatively correlated with adult mortality, thinness in both age ranges, HIV/AIDS, under-five deaths, and infant deaths.
""")
162
+
163
+
164
# GRAPHS
# Encode the development status as 1 (Developed) / 0 (Developing).
status_codes = {'Developed': 1, 'Developing': 0}
df['Status'] = df['Status'].map(status_codes)
166
+
167
def plot_by_country_development(data, value_column, value_title):
    """Plot the yearly mean of a column, split by development status.

    Args:
        data: DataFrame with 'Year' and 'Status' columns plus
            `value_column`. 'Status' may hold the codes 0 (Developing) and
            1 (Developed) or the labels 'Developing' / 'Developed'.
        value_column: name of the column to average per year and status.
        value_title: label used for the y-axis and in the figure title.

    Returns:
        A plotly Figure with one line per status ('Developing' and
        'Developed') over the years.
    """
    status_labels = {0: 'Developing', 1: 'Developed'}

    yearly_means = data.groupby(['Year', 'Status'])[value_column].mean().unstack('Status')
    # Select the status columns by label instead of overwriting the column
    # names positionally: a positional assignment fails when one status is
    # absent and mislabels the lines if the column order ever differs.
    # A status with no rows ends up as an all-zero line.
    value_year = (
        yearly_means
        .rename(columns=status_labels)
        .reindex(columns=['Developing', 'Developed'])
        .fillna(0)
    )

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=value_year.index, y=value_year['Developing'], mode='lines', name='Developing', marker_color='#f075c2'))
    fig.add_trace(go.Scatter(x=value_year.index, y=value_year['Developed'], mode='lines', name='Developed', marker_color='#28d2c2'))

    fig.update_layout(height=500, xaxis_title="Years", yaxis_title=value_title,
                      title_text=f'{value_title} Average of Countries Over The Years',
                      template="plotly_dark")
    return fig
180
+
181
+
182
# Full-width overview charts: (column, axis/title label), rendered in order.
overview_charts = (
    ('Life_expectancy', 'Life Expectancy'),
    ('Schooling', 'Schooling Level'),
    ('Income_composition_of_resources', 'Income Composition of Resources'),
)
for overview_column, overview_title in overview_charts:
    st.plotly_chart(plot_by_country_development(df, overview_column, overview_title))
185
+
186
st.write("### Population Analysis")
# The two thinness columns are titled as thinness rather than as
# "population", since the plotted values come from the Thinness_* columns.
# Figure names describe what each chart shows.
fig_thinness_5_9 = plot_by_country_development(df, 'Thinness_5-9_years', 'Thinness 5-9 Years')
fig_thinness_1_19 = plot_by_country_development(df, 'Thinness_1-19_years', 'Thinness 1-19 Years')
fig_adult_mortality = plot_by_country_development(df, 'Adult_mortality', ' Adult Mortality')
fig_infant_deaths = plot_by_country_development(df, 'Infant_deaths', 'Infant Deaths')

height = 400
width = 400
# Listed in display order: charts alternate between the left and right column.
population_figures = [fig_thinness_5_9, fig_thinness_1_19, fig_adult_mortality, fig_infant_deaths]
for population_figure in population_figures:
    population_figure.update_layout(height=height, width=width)

col1, col2 = st.columns(2)
for position, population_figure in enumerate(population_figures):
    with (col1 if position % 2 == 0 else col2):
        st.plotly_chart(population_figure)
209
+
210
+
211
st.write("### Diseases Analysis")
fig_hiv = plot_by_country_development(df, 'HIV/AIDS', 'HIV/AIDS')
fig_diptheria = plot_by_country_development(df, 'Diphtheria', 'Diphtheria')
fig_polio = plot_by_country_development(df, 'Polio', 'Polio')
fig_hepatitisB = plot_by_country_development(df, 'HepatitisB', 'HepatitisB')

height = 400
width = 400
# Listed in display order: charts alternate between the left and right column.
disease_figures = [fig_hiv, fig_diptheria, fig_polio, fig_hepatitisB]
for disease_figure in disease_figures:
    disease_figure.update_layout(height=height, width=width)

col1, col2 = st.columns(2)
for position, disease_figure in enumerate(disease_figures):
    target_column = col1 if position % 2 == 0 else col2
    with target_column:
        st.plotly_chart(disease_figure)
234
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ matplotlib
3
+ plotly
4
+ seaborn
5
+ scipy
6
+ streamlit