Spaces:
Sleeping
Sleeping
import pandas as pd | |
import matplotlib.pyplot as plt | |
import plotly.graph_objects as go | |
import numpy as np | |
import seaborn as sns | |
from scipy.stats.mstats import winsorize | |
import streamlit as st | |
# Load the life-expectancy CSV that sits next to this script.
df = pd.read_csv("Life Expectancy Data.csv")

st.title('Analyzing The World :earth_africa:')
st.write('**Below data is edited for better analysis and has 2900 rows.It gives life expectancy info for every country between the years 2000-2015. We will get so see the development or regession for each coutry and the world average.**')

# Raw CSV header -> identifier used by the rest of the script. Several raw
# headers carry leading/trailing spaces, so the keys must match them exactly.
column_renames = {
    " BMI ": "BMI",
    "Life expectancy ": "Life_expectancy",
    "Adult Mortality": "Adult_mortality",
    "infant deaths": "Infant_deaths",
    "percentage expenditure": "Percentage_expenditure",
    "Hepatitis B": "HepatitisB",
    "Measles ": "Measles",
    "under-five deaths ": "Under_five_deaths",
    "Total expenditure": "Total_expenditure",
    "Diphtheria ": "Diphtheria",
    " thinness 1-19 years": "Thinness_1-19_years",
    " thinness 5-9 years": "Thinness_5-9_years",
    " HIV/AIDS": "HIV/AIDS",
    "Income composition of resources": "Income_composition_of_resources",
}
df = df.rename(columns=column_renames)
# --- Missing-value imputation ---------------------------------------------
# Step 1: fill gaps inside each country's own series by linear interpolation.
# interpolate() returns a new object, so the result must be assigned back;
# previously it was computed and thrown away. Only numeric columns are
# interpolated (text columns cannot be), and 'Year' is left untouched.
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns if col != 'Year'
]
df[numeric_cols] = df.groupby('Country')[numeric_cols].transform(
    lambda series: series.interpolate(method='linear')
)

# Step 2: for every column from position 4 onward, replace what is still
# missing with that column's median within the same year.
imputed_data = []
for year in df['Year'].unique():
    year_data = df[df['Year'] == year].copy()
    for col in year_data.columns[4:]:
        # median() skips NaN by default, so no explicit dropna() is needed.
        year_data[col] = year_data[col].fillna(year_data[col].median())
    imputed_data.append(year_data)
df = pd.concat(imputed_data)

# Step 3: Life_expectancy is not covered by the loop above; fall back to the
# overall mean. Assign the result instead of calling fillna(inplace=True) on
# a column selection, which does not reliably write back to the frame.
df['Life_expectancy'] = df['Life_expectancy'].fillna(df['Life_expectancy'].mean())

# The concat above leaves the rows grouped by year with their old labels;
# renumber them 0..n-1 and discard the old index.
df = df.reset_index(drop=True)
st.dataframe(df)
st.write('**For a better Analysis, we should also remove outliers. Lets see them first.**')

# Column name -> 1-based position of its panel in the 5x4 subplot grid.
col_dict = {'Life_expectancy':1,'Adult_mortality':2,'Infant_deaths':3,'Alcohol':4,'Percentage_expenditure':5,'HepatitisB':6,'Measles':7,'BMI':8,'Under_five_deaths':9,'Polio':10,'Total_expenditure':11,'Diphtheria':12,'HIV/AIDS':13,'GDP':14,'Population':15,'Thinness_1-19_years':16,'Thinness_5-9_years':17,'Income_composition_of_resources':18,'Schooling':19}

# Draw on explicit Figure/Axes objects rather than pyplot's implicit
# "current figure", so the panels always land on the figure that is rendered.
fig = plt.figure(figsize=(20, 30))
for variable, position in col_dict.items():
    ax = fig.add_subplot(5, 4, position)
    ax.boxplot(df[variable])
    ax.set_title(variable)
    ax.grid(True)
st.pyplot(fig)
# Streamlit re-executes the script on every interaction; close the figure
# once it has been rendered so pyplot does not keep accumulating open figures.
plt.close(fig)
st.write("""
We'll remove outliers in Infant_Deaths, Measles, and Under_five_deaths columns since values beyond 1000 are unrealistic.
Similarly, we'll address extreme values in Expenditure_Percentage, GDP, and Population columns by taking logarithmic values.
BMI values above 40 indicate extreme obesity, and some countries have averages around 60, which is not possible. Therefore, we'll remove the entire BMI column.
For other columns with outliers, we'll apply winsorization for data normalization.
""")

# Remove outliers and log transform
# Keep only rows where all three count columns are below 1001.
plausible_rows = df[['Infant_deaths', 'Measles', 'Under_five_deaths']].lt(1001).all(axis=1)
# Take an explicit copy of the filtered frame so the column assignments below
# write to an independent DataFrame instead of a slice of the original.
df = df.loc[plausible_rows].drop(columns='BMI').copy()

# Assign the logarithms back to the frame; previously the transformed values
# were computed and discarded, so the columns stayed on their raw scale.
log_cols = ['Percentage_expenditure', 'Population', 'GDP']
with np.errstate(divide='ignore'):
    # log(0) yields -inf; that is expected here and mapped to 0 just below.
    df[log_cols] = np.log(df[log_cols])
df = df.replace([np.inf, -np.inf], 0)
# Winsorization
# (lower, upper) tail fractions passed to winsorize() for each column.
winsor_limits = {
    'Life_expectancy': (0.05, 0),
    'Adult_mortality': (0, 0.04),
    'Alcohol': (0.0, 0.01),
    'HepatitisB': (0.20, 0.0),
    'Polio': (0.20, 0.0),
    'Total_expenditure': (0.0, 0.02),
    'Diphtheria': (0.11, 0.0),
    'HIV/AIDS': (0.0, 0.21),
    'Thinness_1-19_years': (0.0, 0.04),
    'Thinness_5-9_years': (0.0, 0.04),
    'Income_composition_of_resources': (0.05, 0.0),
}
# Limits for any column without its own entry above (here: 'Schooling').
fallback_limits = (0.03, 0.01)

cols_to_winsorize = [*winsor_limits, 'Schooling']
winz_cols = list(cols_to_winsorize)


def _winsorize_column(column):
    """Clip one column's tails using the limits registered for its name."""
    tail_limits = winsor_limits.get(column.name, fallback_limits)
    return winsorize(column, limits=tail_limits, axis=0)


df[winz_cols] = df[cols_to_winsorize].apply(_winsorize_column)
# Plot boxplots for winsorized variables
# The winsorized columns plus the six columns treated by filtering / log
# transform: 18 panels, one per cell of the 3x6 grid.
cols_to_plot = winz_cols + ['Measles', 'Infant_deaths', 'Under_five_deaths', 'GDP', 'Population', 'Percentage_expenditure']
fig, axs = plt.subplots(3, 6, figsize=(20, 20))
for ax, col in zip(axs.flat, cols_to_plot):
    sns.boxplot(y=df[col], ax=ax, color="green")
    ax.set_title(col)
    ax.set_ylabel('')
    ax.grid(True)
# Lay out this specific figure rather than whichever one pyplot considers current.
fig.tight_layout()
st.pyplot(fig)
# Streamlit re-executes the script on every interaction; close the figure
# once it has been rendered so pyplot does not keep accumulating open figures.
plt.close(fig)
st.write('**Analysis**')

# One histogram per plotted column, placed left-to-right, top-to-bottom in a
# 6x6 grid. Explicit Figure/Axes objects are used so every panel is attached
# to the figure that is rendered below.
fig = plt.figure(figsize=(20, 20))
for position, variable in enumerate(cols_to_plot, start=1):
    ax = fig.add_subplot(6, 6, position)
    ax.hist(df[variable])
    ax.set_title(variable)
    ax.set_ylabel('')
    ax.grid(True)
st.pyplot(fig)
# Streamlit re-executes the script on every interaction; close the figure
# once it has been rendered so pyplot does not keep accumulating open figures.
plt.close(fig)
# Plot correlation heatmap
life_exp = cols_to_plot + ['Year']
# Compute the correlation matrix once and reuse it for the heatmap and tables.
corr = df[life_exp].corr()
corr_matrix = corr.values
corr_columns = corr.columns

heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, linewidths=4, ax=heatmap_ax)
st.pyplot(heatmap_fig)
# Close after rendering so reruns of the script do not accumulate figures.
plt.close(heatmap_fig)

# Get correlations
# The matrix is symmetric with a diagonal of self-correlations, so the strict
# upper triangle lists every unordered column pair exactly once. This replaces
# slicing the flattened matrix with hand-tuned offsets and de-duplicating on
# the correlation value, which could drop distinct pairs sharing a value.
top_n = 25
pair_rows, pair_cols = np.triu_indices(len(corr_columns), k=1)
column_names = corr_columns.to_numpy()
pair_corr = pd.DataFrame({
    '1': column_names[pair_rows],
    '2': column_names[pair_cols],
    'Correlation': corr_matrix[pair_rows, pair_cols],
}).dropna(subset=['Correlation'])  # undefined correlations cannot be ranked

# Create DataFrames for positive and negative correlations
# Filter by sign first so the "negative" table can never contain positive
# values (and vice versa) when fewer than top_n pairs of that sign exist.
corr_df = (
    pair_corr[pair_corr['Correlation'].gt(0)]
    .nlargest(top_n, 'Correlation')
    .reset_index(drop=True)
)
neg_corr_df = (
    pair_corr[pair_corr['Correlation'].lt(0)]
    .nsmallest(top_n, 'Correlation')
    .reset_index(drop=True)
)

# Display the top correlations
st.write("Top 25 Positive Correlations:")
st.dataframe(corr_df)
st.write("Top 25 Negative Correlations:")
st.dataframe(neg_corr_df)
# Narrative summary shown under the correlation tables.
correlation_insights = """
Key insights from the correlation analysis:
- Adult mortality exhibits a negative correlation with schooling and income composition, while it positively correlates with HIV/AIDS.
- Infant deaths and under-five deaths are strongly positively correlated.
- Schooling and alcohol consumption display a positive relationship.
- Percentage expenditure shows positive correlations with schooling, income composition, GDP, and life expectancy.
- Hepatitis B is strongly positively correlated with polio and diphtheria.
- Polio and diphtheria show strong positive correlations with each other and with life expectancy.
- Life expectancy is positively correlated with schooling, income composition, GDP, diphtheria, polio, and percentage expenditure. Conversely, it is negatively correlated with adult mortality, thinness in both age ranges, HIV/AIDS, under-five deaths, and infant deaths.
"""
st.write(correlation_insights)

# GRAPHS
# Encode the development status as 0/1; values outside this mapping become NaN.
status_codes = {'Developed': 1, 'Developing': 0}
df['Status'] = df['Status'].map(status_codes)
def plot_by_country_development(data, value_column, value_title):
    """Build a line chart of a column's yearly mean, split by development status.

    Args:
        data: DataFrame containing 'Year' and 'Status' columns plus
            ``value_column``. 'Status' may hold the codes 0 (Developing) and
            1 (Developed), or the labels 'Developing' / 'Developed'.
        value_column: Name of the column to average per year and status.
        value_title: Label used for the y-axis and as the chart-title prefix.

    Returns:
        A plotly figure with one line for 'Developing' and one for 'Developed'.
    """
    value_year = (
        data.groupby(['Year', 'Status'])[value_column]
        .mean()
        .unstack('Status')
        # Label the series by status value rather than by column position, so
        # a different column order can never swap the two lines.
        .rename(columns={0: 'Developing', 1: 'Developed'})
        # Guarantee both series exist even if one status group has no rows;
        # assigning a two-name list to a one-column frame would raise instead.
        .reindex(columns=['Developing', 'Developed'])
        .fillna(0)
    )

    fig = go.Figure()
    for status, color in (('Developing', '#f075c2'), ('Developed', '#28d2c2')):
        fig.add_trace(go.Scatter(x=value_year.index, y=value_year[status],
                                 mode='lines', name=status, marker_color=color))
    fig.update_layout(height=500, xaxis_title="Years", yaxis_title=value_title,
                      title_text=f'{value_title} Average of Countries Over The Years',
                      template="plotly_dark")
    return fig
# Full-width charts: (column to average, label for axis and title), in display order.
headline_charts = (
    ('Life_expectancy', 'Life Expectancy'),
    ('Schooling', 'Schooling Level'),
    ('Income_composition_of_resources', 'Income Composition of Resources'),
)
for chart_column, chart_label in headline_charts:
    st.plotly_chart(plot_by_country_development(df, chart_column, chart_label))
st.write("### Population Analysis")

# Every chart in this section is rendered at the same fixed size.
height = 400
width = 400

# (column to average, label for axis and title), in display order.
# NOTE(review): the two thinness columns are labelled as "... years old
# population" — confirm this wording is intended.
population_panels = [
    ('Thinness_5-9_years', '5-9 years old population'),
    ('Thinness_1-19_years', '1-19 years old population'),
    ('Adult_mortality', ' Adult Mortality'),
    ('Infant_deaths', 'Infant Deaths'),
]
population_figs = [
    plot_by_country_development(df, panel_column, panel_label)
    for panel_column, panel_label in population_panels
]
for figure in population_figs:
    figure.update_layout(height=height, width=width)

# Two side-by-side columns, filled alternately: left, right, left, right.
col1, col2 = st.columns(2)
for slot, figure in zip((col1, col2, col1, col2), population_figs):
    with slot:
        st.plotly_chart(figure)
st.write("### Diseases Analysis")

# Every chart in this section is rendered at the same fixed size.
height = 400
width = 400

# Build the four figures and size each one as it is created;
# update_layout() returns the figure it was called on.
fig_hiv, fig_diptheria, fig_polio, fig_hepatitisB = (
    plot_by_country_development(df, disease_column, disease_label).update_layout(
        height=height, width=width
    )
    for disease_column, disease_label in (
        ('HIV/AIDS', 'HIV/AIDS'),
        ('Diphtheria', 'Diphtheria'),
        ('Polio', 'Polio'),
        ('HepatitisB', 'HepatitisB'),
    )
)

# Two side-by-side columns, filled alternately: left, right, left, right.
col1, col2 = st.columns(2)
placements = (
    (col1, fig_hiv),
    (col2, fig_diptheria),
    (col1, fig_polio),
    (col2, fig_hepatitisB),
)
for slot, figure in placements:
    with slot:
        st.plotly_chart(figure)