Spaces:
Sleeping
Sleeping
ibrahimnomad
commited on
Commit
•
541ab20
1
Parent(s):
133e276
Upload 3 files
Browse files- Life Expectancy Data.csv +0 -0
- app.py +234 -0
- requirements.txt +6 -0
Life Expectancy Data.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import plotly.graph_objects as go
|
4 |
+
import numpy as np
|
5 |
+
import seaborn as sns
|
6 |
+
from scipy.stats.mstats import winsorize
|
7 |
+
import streamlit as st
|
8 |
+
# Load the life-expectancy CSV that ships alongside the app.
df = pd.read_csv("Life Expectancy Data.csv")

st.title('Analyzing The World :earth_africa:')
st.write('**Below data is edited for better analysis and has 2900 rows.It gives life expectancy info for every country between the years 2000-2015. We will get so see the development or regession for each coutry and the world average.**')

# Raw CSV header -> identifier used by the rest of the script. Several raw
# headers carry leading/trailing spaces, so the keys must match them exactly.
column_renames = {
    " BMI ": "BMI",
    "Life expectancy ": "Life_expectancy",
    "Adult Mortality": "Adult_mortality",
    "infant deaths": "Infant_deaths",
    "percentage expenditure": "Percentage_expenditure",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    "under-five deaths ": "Under_five_deaths",
    "Total expenditure": "Total_expenditure",
    "Diphtheria ": "Diphtheria",
    " thinness 1-19 years": "Thinness_1-19_years",
    " thinness 5-9 years": "Thinness_5-9_years",
    " HIV/AIDS": "HIV/AIDS",
    "Income composition of resources": "Income_composition_of_resources",
}
df.rename(columns=column_renames, inplace=True)
|
26 |
+
|
27 |
+
# Missing values are filled in three passes.
#
# Pass 1: linear interpolation inside each country's own series. The result
# has to be assigned back: groupby/transform returns a new object and leaves
# `df` untouched, so calling it without assignment does nothing.
numeric_cols = list(df.select_dtypes(include="number").columns)
df[numeric_cols] = df.groupby("Country")[numeric_cols].transform(
    lambda series: series.interpolate(method="linear")
)

# Pass 2: anything still missing gets the median of the same year. Only the
# columns from position 4 onwards are treated this way; the first four
# columns are left alone here.
imputed_data = []
for year in df["Year"].unique():
    year_data = df[df["Year"] == year].copy()
    fill_cols = year_data.columns[4:]
    year_data[fill_cols] = year_data[fill_cols].fillna(year_data[fill_cols].median())
    imputed_data.append(year_data)

# ignore_index=True renumbers the rows 0..n-1, replacing the former
# reset_index() + drop('index') pair.
df = pd.concat(imputed_data, ignore_index=True)

# Pass 3: remaining gaps in the target column get the overall mean. Assign the
# result instead of calling fillna(inplace=True) on a column selection, which
# is deprecated and has no effect once pandas copy-on-write is enabled.
df["Life_expectancy"] = df["Life_expectancy"].fillna(df["Life_expectancy"].mean())
|
38 |
+
|
39 |
+
st.dataframe(df)

st.write('**For a better Analysis, we should also remove outliers. Lets see them first.**')

# One box plot per numeric column, laid out on a 5x4 grid. col_dict maps each
# column name to its 1-based slot in that grid.
boxplot_columns = [
    'Life_expectancy', 'Adult_mortality', 'Infant_deaths', 'Alcohol',
    'Percentage_expenditure', 'HepatitisB', 'Measles', 'BMI',
    'Under_five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria',
    'HIV/AIDS', 'GDP', 'Population', 'Thinness_1-19_years',
    'Thinness_5-9_years', 'Income_composition_of_resources', 'Schooling',
]
col_dict = {name: slot for slot, name in enumerate(boxplot_columns, start=1)}

fig = plt.figure(figsize=(20, 30))
for column_name, slot in col_dict.items():
    panel = fig.add_subplot(5, 4, slot)
    panel.boxplot(df[column_name])
    panel.set_title(column_name)
    panel.grid(True)
st.pyplot(fig)
|
50 |
+
|
51 |
+
st.write("""
We'll remove outliers in Infant_Deaths, Measles, and Under_five_deaths columns since values beyond 1000 are unrealistic.

Similarly, we'll address extreme values in Expenditure_Percentage, GDP, and Population columns by taking logarithmic values.

BMI values above 40 indicate extreme obesity, and some countries have averages around 60, which is not possible. Therefore, we'll remove the entire BMI column.

For other columns with outliers, we'll apply winsorization for data normalization.
""")

# Remove outliers and log transform
# Keep only rows where all three count columns are at most 1000.
outlier_cols = ['Infant_deaths', 'Measles', 'Under_five_deaths']
within_range = df[outlier_cols].lt(1001).all(axis=1)
# .copy() makes the filtered frame independent, so the column assignment
# below writes to `df` itself rather than to a view of the unfiltered data.
df = df[within_range].drop(columns='BMI').copy()

# The transform must be assigned back; a bare .apply(np.log) only builds a
# temporary frame and leaves these columns on their original scale.
log_cols = ['Percentage_expenditure', 'Population', 'GDP']
with np.errstate(divide='ignore'):  # log(0) -> -inf is handled just below
    df[log_cols] = np.log(df[log_cols])
df = df.replace([np.inf, -np.inf], 0)
|
66 |
+
|
67 |
+
# Winsorization
|
68 |
+
# (lower, upper) fraction of values to clip on each side, per column.
# Any column without an entry here uses `default_limits`.
winsor_limits = {
    'Life_expectancy': (0.05, 0),
    'Adult_mortality': (0, 0.04),
    'Alcohol': (0.0, 0.01),
    'HepatitisB': (0.20, 0.0),
    'Polio': (0.20, 0.0),
    'Total_expenditure': (0.0, 0.02),
    'Diphtheria': (0.11, 0.0),
    'HIV/AIDS': (0.0, 0.21),
    'Thinness_1-19_years': (0.0, 0.04),
    'Thinness_5-9_years': (0.0, 0.04),
    'Income_composition_of_resources': (0.05, 0.0),
}
default_limits = (0.03, 0.01)

cols_to_winsorize = [
    'Life_expectancy', 'Adult_mortality', 'Alcohol', 'HepatitisB', 'Polio',
    'Total_expenditure', 'Diphtheria', 'HIV/AIDS', 'Thinness_1-19_years',
    'Thinness_5-9_years', 'Income_composition_of_resources', 'Schooling',
]
winz_cols = list(cols_to_winsorize)

# Winsorize each column in place with its own limits.
df[winz_cols] = df[cols_to_winsorize].apply(
    lambda series: winsorize(
        series, limits=winsor_limits.get(series.name, default_limits)
    ),
    axis=0,
)
|
85 |
+
|
86 |
+
# Plot boxplots for winsorized variables
|
87 |
+
# Every cleaned column: the winsorized ones plus the filtered / log-scaled ones.
cols_to_plot = winz_cols + ['Measles', 'Infant_deaths', 'Under_five_deaths', 'GDP', 'Population', 'Percentage_expenditure']

# One box plot per column on a 3x6 grid.
fig, axs = plt.subplots(nrows=3, ncols=6, figsize=(20, 20))
for panel, column_name in zip(axs.ravel(), cols_to_plot):
    sns.boxplot(y=df[column_name], ax=panel, color="green")
    panel.set_title(column_name)
    panel.set_ylabel('')
    panel.grid(True)

fig.tight_layout()
st.pyplot(fig)
|
97 |
+
|
98 |
+
st.write('**Analysis**')

# Histogram of every cleaned column; slots are filled row by row on a 6x6 grid.
fig = plt.figure(figsize=(20, 20))
for slot, column_name in enumerate(cols_to_plot, start=1):
    panel = fig.add_subplot(6, 6, slot)
    panel.hist(df[column_name])
    panel.set_title(column_name)
    panel.set_ylabel('')
    panel.grid(True)
st.pyplot(fig)
|
108 |
+
|
109 |
+
# Plot correlation heatmap
|
110 |
+
# Columns that take part in the correlation analysis.
life_exp = cols_to_plot + ['Year']

# Compute the correlation table once; the heatmap shows it and the raw array
# is kept in `corr_matrix` for the ranking step further down.
heatmap_fig = plt.figure(figsize=(15, 10))
corr_frame = df[life_exp].corr()
corr_matrix = corr_frame.values
sns.heatmap(corr_frame, annot=True, linewidths=4)  # draws on heatmap_fig's axes
st.pyplot(heatmap_fig)
|
114 |
+
|
115 |
+
|
116 |
+
|
117 |
+
# Get correlations
|
118 |
+
# Rank the pairwise correlations.
#
# Only the strict upper triangle of the symmetric matrix is used, so each
# unordered pair appears exactly once and the diagonal (self-correlation) is
# excluded. This avoids hard-coded slice sizes that depend on the number of
# columns, and avoids de-duplicating on the correlation value, which could
# drop distinct pairs that happen to share a value.
TOP_N = 25
corr_columns = df[life_exp].columns
pair_rows, pair_cols = np.triu_indices_from(corr_matrix, k=1)
pair_corr = pd.DataFrame({
    '1': corr_columns[pair_rows],
    '2': corr_columns[pair_cols],
    'Correlation': corr_matrix[pair_rows, pair_cols],
}).dropna(subset=['Correlation'])  # NaN appears when a column is constant

# Strongest positive pairs first / strongest negative pairs first.
corr_df = (
    pair_corr[pair_corr['Correlation'] > 0]
    .nlargest(TOP_N, 'Correlation')
    .reset_index(drop=True)
)
neg_corr_df = (
    pair_corr[pair_corr['Correlation'] < 0]
    .nsmallest(TOP_N, 'Correlation')
    .reset_index(drop=True)
)

# Display the top correlations
st.write("Top 25 Positive Correlations:")
st.dataframe(corr_df)

st.write("Top 25 Negative Correlations:")
st.dataframe(neg_corr_df)

st.write("""
Key insights from the correlation analysis:

- Adult mortality exhibits a negative correlation with schooling and income composition, while it positively correlates with HIV/AIDS.
- Infant deaths and under-five deaths are strongly positively correlated.
- Schooling and alcohol consumption display a positive relationship.
- Percentage expenditure shows positive correlations with schooling, income composition, GDP, and life expectancy.
- Hepatitis B is strongly positively correlated with polio and diphtheria.
- Polio and diphtheria show strong positive correlations with each other and with life expectancy.
- Life expectancy is positively correlated with schooling, income composition, GDP, diphtheria, polio, and percentage expenditure. Conversely, it is negatively correlated with adult mortality, thinness in both age ranges, HIV/AIDS, under-five deaths, and infant deaths.
""")
|
162 |
+
|
163 |
+
|
164 |
+
# GRAPHS
|
165 |
+
# Encode development status as a number: 1 = Developed, 0 = Developing.
status_codes = {'Developed': 1, 'Developing': 0}
df['Status'] = df['Status'].map(status_codes)
|
166 |
+
|
167 |
+
def plot_by_country_development(data, value_column, value_title):
    """Build a line chart of a column's yearly mean, split by development status.

    Args:
        data: DataFrame with 'Year', 'Status' and ``value_column`` columns.
            'Status' may hold the numeric codes (0 = Developing,
            1 = Developed) or the labels 'Developing' / 'Developed'.
        value_column: Name of the column to average per year and status.
        value_title: Label used for the y-axis and in the chart title.

    Returns:
        A plotly ``Figure`` with one 'Developing' and one 'Developed' line.
    """
    status_labels = {0: 'Developing', 1: 'Developed'}
    value_year = (
        data.groupby(['Year', 'Status'])[value_column]
        .mean()
        .unstack('Status')
        # Label the columns by status value, not by position: a positional
        # assignment raises when only one status is present and silently
        # swaps the two series if 'Status' still holds the string labels.
        .rename(columns=status_labels)
        .reindex(columns=['Developing', 'Developed'])
        .fillna(0)
    )

    fig = go.Figure()
    for label, color in (('Developing', '#f075c2'), ('Developed', '#28d2c2')):
        fig.add_trace(go.Scatter(x=value_year.index, y=value_year[label],
                                 mode='lines', name=label, marker_color=color))

    fig.update_layout(height=500, xaxis_title="Years", yaxis_title=value_title,
                      title_text=f'{value_title} Average of Countries Over The Years',
                      template="plotly_dark")
    return fig
|
180 |
+
|
181 |
+
|
182 |
+
# Full-width trend charts for the three headline indicators.
headline_metrics = (
    ('Life_expectancy', 'Life Expectancy'),
    ('Schooling', 'Schooling Level'),
    ('Income_composition_of_resources', 'Income Composition of Resources'),
)
for metric_column, metric_title in headline_metrics:
    st.plotly_chart(plot_by_country_development(df, metric_column, metric_title))
|
185 |
+
|
186 |
+
st.write("### Population Analysis")

# (column, axis/title label) for each chart, in display order. The labels name
# the plotted metric: the thinness columns are thinness figures, not
# population counts, and no label carries stray whitespace.
population_charts = (
    ('Thinness_5-9_years', 'Thinness (ages 5-9)'),
    ('Thinness_1-19_years', 'Thinness (ages 1-19)'),
    ('Adult_mortality', 'Adult Mortality'),
    ('Infant_deaths', 'Infant Deaths'),
)

height = 400
width = 400
col1, col2 = st.columns(2)

# Fill the two page columns alternately: left, right, left, right.
for position, (metric_column, metric_title) in enumerate(population_charts):
    chart = plot_by_country_development(df, metric_column, metric_title)
    chart.update_layout(height=height, width=width)
    with (col1 if position % 2 == 0 else col2):
        st.plotly_chart(chart)
|
209 |
+
|
210 |
+
|
211 |
+
st.write("### Diseases Analysis")

height = 400
width = 400

# Build the four disease charts first, then size them identically.
disease_metrics = (
    ('HIV/AIDS', 'HIV/AIDS'),
    ('Diphtheria', 'Diphtheria'),
    ('Polio', 'Polio'),
    ('HepatitisB', 'HepatitisB'),
)
disease_charts = [
    plot_by_country_development(df, metric_column, metric_title)
    for metric_column, metric_title in disease_metrics
]
for chart in disease_charts:
    chart.update_layout(height=height, width=width)

# Two page columns, filled left, right, left, right.
col1, col2 = st.columns(2)
for chart, page_column in zip(disease_charts, (col1, col2, col1, col2)):
    with page_column:
        st.plotly_chart(chart)
|
234 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
matplotlib
|
3 |
+
plotly
|
4 |
+
seaborn
|
5 |
+
scipy
|
6 |
+
streamlit
|