Spaces:

ibrahimnomad
/

World_Data_Analysis

Sleeping

App Files Files Community

ibrahimnomad committed on Apr 30

Commit

541ab20

•

1 Parent(s): 133e276

Upload 3 files

Browse files

Files changed (3) hide show

Life Expectancy Data.csv +0 -0
app.py +234 -0
requirements.txt +6 -0

Life Expectancy Data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

app.py ADDED Viewed

	@@ -0,0 +1,234 @@

+import pandas as pd
+import matplotlib.pyplot as plt
+import plotly.graph_objects as go
+import numpy as np
+import seaborn as sns
+from scipy.stats.mstats import winsorize
+import streamlit as st
+df = pd.read_csv("Life Expectancy Data.csv")
+st.title('Analyzing The World :earth_africa:')
+st.write('**Below data is edited for better analysis and has 2900 rows.It gives life expectancy info for every country between the years 2000-2015. We will get so see the development or regession for each coutry and the world average.**')
+df.rename(columns = {" BMI " :"BMI",
+                                  "Life expectancy ": "Life_expectancy",
+                                  "Adult Mortality":"Adult_mortality",
+                                  "infant deaths":"Infant_deaths",
+                                  "percentage expenditure":"Percentage_expenditure",
+                                  "Hepatitis B":"HepatitisB",
+                                  "Measles ":"Measles",
+                                  "under-five deaths ": "Under_five_deaths",
+                                  "Total expenditure":"Total_expenditure",
+                                  "Diphtheria ": "Diphtheria",
+                                  " thinness  1-19 years":"Thinness_1-19_years",
+                                  " thinness 5-9 years":"Thinness_5-9_years",
+                                  " HIV/AIDS":"HIV/AIDS",
+                                  "Income composition of resources":"Income_composition_of_resources"}, inplace = True)
+df.groupby('Country').apply(lambda group: group.interpolate(method= 'linear'))
+imputed_data = []
+for year in list(df.Year.unique()):
+    year_data = df[df.Year == year].copy()
+    for col in list(year_data.columns)[4:]:
+        year_data[col] = year_data[col].fillna(year_data[col].dropna().median()).copy()
+    imputed_data.append(year_data)
+df = pd.concat(imputed_data).copy()
+df['Life_expectancy'].fillna(df['Life_expectancy'].mean(), inplace=True)
+df.reset_index(inplace=True)
+df = df.drop('index', axis=1)
+st.dataframe(df)
+st.write('**For a better Analysis, we should also remove outliers. Lets see them first.**')
+col_dict = {'Life_expectancy':1,'Adult_mortality':2,'Infant_deaths':3,'Alcohol':4,'Percentage_expenditure':5,'HepatitisB':6,'Measles':7,'BMI':8,'Under_five_deaths':9,'Polio':10,'Total_expenditure':11,'Diphtheria':12,'HIV/AIDS':13,'GDP':14,'Population':15,'Thinness_1-19_years':16,'Thinness_5-9_years':17,'Income_composition_of_resources':18,'Schooling':19}
+fig = plt.figure(figsize=(20,30))
+for variable, i in col_dict.items():
+    plt.subplot(5, 4, i)
+    plt.boxplot(df[variable])
+    plt.title(variable)
+    plt.grid(True)
+st.pyplot(fig)
+st.write("""
+We'll remove outliers in Infant_Deaths, Measles, and Under_five_deaths columns since values beyond 1000 are unrealistic.
+Similarly, we'll address extreme values in Expenditure_Percentage, GDP, and Population columns by taking logarithmic values.
+BMI values above 40 indicate extreme obesity, and some countries have averages around 60, which is not possible. Therefore, we'll remove the entire BMI column.
+For other columns with outliers, we'll apply winsorization for data normalization.
+""")
+# Remove outliers and log transform
+df = df[df[['Infant_deaths', 'Measles', 'Under_five_deaths']].lt(1001).all(axis=1)]
+df.drop('BMI', axis=1, inplace=True)
+df[['Percentage_expenditure', 'Population', 'GDP']].apply(np.log)
+df.replace([np.inf, -np.inf], 0, inplace=True)
+# Winsorization
+cols_to_winsorize = ['Life_expectancy', 'Adult_mortality', 'Alcohol', 'HepatitisB', 'Polio', 'Total_expenditure',
+                     'Diphtheria', 'HIV/AIDS', 'Thinness_1-19_years', 'Thinness_5-9_years',
+                     'Income_composition_of_resources', 'Schooling']
+winz_cols = [col for col in cols_to_winsorize]
+df[winz_cols] = df[cols_to_winsorize].apply(lambda x: winsorize(x, limits=((0.05, 0) if x.name == 'Life_expectancy' else
+                                                                             (0, 0.04) if x.name == 'Adult_mortality' else
+                                                                             (0.0, 0.01) if x.name == 'Alcohol' else
+                                                                             (0.20, 0.0) if x.name == 'HepatitisB' else
+                                                                             (0.20, 0.0) if x.name == 'Polio' else
+                                                                             (0.0, 0.02) if x.name == 'Total_expenditure' else
+                                                                             (0.11, 0.0) if x.name == 'Diphtheria' else
+                                                                             (0.0, 0.21) if x.name == 'HIV/AIDS' else
+                                                                             (0.0, 0.04) if x.name == 'Thinness_1-19_years' else
+                                                                             (0.0, 0.04) if x.name == 'Thinness_5-9_years' else
+                                                                             (0.05, 0.0) if x.name == 'Income_composition_of_resources' else
+                                                                             (0.03, 0.01)), axis=0))
+# Plot boxplots for winsorized variables
+fig, axs = plt.subplots(3, 6, figsize=(20, 20))
+cols_to_plot = winz_cols + ['Measles', 'Infant_deaths', 'Under_five_deaths', 'GDP', 'Population', 'Percentage_expenditure']
+for ax, col in zip(axs.flat, cols_to_plot):
+    sns.boxplot(y=df[col], ax=ax, color="green")
+    ax.set_title(col)
+    ax.set_ylabel('')
+    ax.grid(True)
+plt.tight_layout()
+st.pyplot(fig)
+st.write('**Analysis**')
+fig = plt.figure(figsize=(20, 20))
+for i, variable in enumerate(cols_to_plot, start=1):
+    plt.subplot(6, 6, i)
+    plt.hist(df[variable])
+    plt.title(variable)
+    plt.ylabel('')
+    plt.grid(True)
+st.pyplot(fig)
+# Plot correlation heatmap
+life_exp = cols_to_plot + ['Year']
+plt.figure(figsize=(15, 10))
+corr_matrix = df[life_exp].corr().values
+st.pyplot(sns.heatmap(df[life_exp].corr(), annot=True, linewidths=4).figure)
+# Get correlations
+flattened_corr = corr_matrix.flatten()
+sorted_corr_indices = np.argsort(flattened_corr)
+top_25_pos_corr_indices = sorted_corr_indices[-70:-1]
+top_25_pos_corr_indices = top_25_pos_corr_indices[::-1]
+top_25_neg_corr_indices = sorted_corr_indices[:50]
+# Create DataFrames for positive and negative correlations
+corr_columns = df[life_exp].columns
+corr_df = pd.DataFrame(columns=['1', '2', 'Correlation'])
+neg_corr_df = pd.DataFrame(columns=['1', '2', 'Correlation'])
+# Populate DataFrames
+for idx in top_25_pos_corr_indices:
+    row, col = np.unravel_index(idx, corr_matrix.shape)
+    if row != col:
+        corr_df = pd.concat([corr_df, pd.DataFrame({'1': [corr_columns[row]], '2': [corr_columns[col]], 'Correlation': [corr_matrix[row, col]]})])
+for idx in top_25_neg_corr_indices:
+    row, col = np.unravel_index(idx, corr_matrix.shape)
+    if row != col:
+        neg_corr_df = pd.concat([neg_corr_df, pd.DataFrame({'1': [corr_columns[row]], '2': [corr_columns[col]], 'Correlation': [corr_matrix[row, col]]})])
+# Drop duplicates from both DataFrames
+corr_df.drop_duplicates(subset=['Correlation'], inplace=True)
+neg_corr_df.drop_duplicates(subset=['Correlation'], inplace=True)
+# Display the top correlations
+st.write("Top 25 Positive Correlations:")
+st.dataframe(corr_df)
+st.write("Top 25 Negative Correlations:")
+st.dataframe(neg_corr_df)
+st.write("""
+Key insights from the correlation analysis:
+- Adult mortality exhibits a negative correlation with schooling and income composition, while it positively correlates with HIV/AIDS.
+- Infant deaths and under-five deaths are strongly positively correlated.
+- Schooling and alcohol consumption display a positive relationship.
+- Percentage expenditure shows positive correlations with schooling, income composition, GDP, and life expectancy.
+- Hepatitis B is strongly positively correlated with polio and diphtheria.
+- Polio and diphtheria show strong positive correlations with each other and with life expectancy.
+- Life expectancy is positively correlated with schooling, income composition, GDP, diphtheria, polio, and percentage expenditure. Conversely, it is negatively correlated with adult mortality, thinness in both age ranges, HIV/AIDS, under-five deaths, and infant deaths.
+""")
+# GRAPHS
+df['Status'] = df['Status'].map({'Developed': 1, 'Developing': 0})
+def plot_by_country_development(data, value_column, value_title):
+    value_year = data.groupby(['Year', 'Status'])[value_column].mean().unstack('Status').fillna(0)
+    value_year.columns = ['Developing', 'Developed']
+    fig = go.Figure()
+    fig.add_trace(go.Scatter(x=value_year.index, y=value_year['Developing'], mode='lines', name='Developing', marker_color='#f075c2'))
+    fig.add_trace(go.Scatter(x=value_year.index, y=value_year['Developed'], mode='lines', name='Developed', marker_color='#28d2c2'))
+    fig.update_layout(height=500, xaxis_title="Years", yaxis_title=value_title,
+                      title_text=f'{value_title} Average of Countries Over The Years',
+                      template="plotly_dark")
+    return fig
+st.plotly_chart(plot_by_country_development(df, 'Life_expectancy', 'Life Expectancy'))
+st.plotly_chart(plot_by_country_development(df, 'Schooling', 'Schooling Level'))
+st.plotly_chart(plot_by_country_development(df, 'Income_composition_of_resources', 'Income Composition of Resources'))
+st.write("### Population Analysis")
+fig_hiv = plot_by_country_development(df, 'Thinness_5-9_years', '5-9 years old population')
+fig_diptheria = plot_by_country_development(df, 'Thinness_1-19_years', '1-19 years old population')
+fig_polio = plot_by_country_development(df, 'Adult_mortality', ' Adult Mortality')
+fig_hepatitisB = plot_by_country_development(df, 'Infant_deaths', 'Infant Deaths')
+height = 400
+width = 400
+fig_hiv.update_layout(height=height,width=width)
+fig_diptheria.update_layout(height=height, width=width)
+fig_polio.update_layout(height=height, width=width)
+fig_hepatitisB.update_layout(height=height, width=width)
+col1, col2 = st.columns(2)
+with col1:
+    st.plotly_chart(fig_hiv)
+with col2:
+    st.plotly_chart(fig_diptheria)
+with col1:
+    st.plotly_chart(fig_polio)
+with col2:
+    st.plotly_chart(fig_hepatitisB)
+st.write("### Diseases Analysis")
+fig_hiv = plot_by_country_development(df, 'HIV/AIDS', 'HIV/AIDS')
+fig_diptheria = plot_by_country_development(df, 'Diphtheria', 'Diphtheria')
+fig_polio = plot_by_country_development(df, 'Polio', 'Polio')
+fig_hepatitisB = plot_by_country_development(df, 'HepatitisB', 'HepatitisB')
+height = 400
+width = 400
+fig_hiv.update_layout(height=height,width=width)
+fig_diptheria.update_layout(height=height, width=width)
+fig_polio.update_layout(height=height, width=width)
+fig_hepatitisB.update_layout(height=height, width=width)
+col1, col2 = st.columns(2)
+with col1:
+    st.plotly_chart(fig_hiv)
+with col2:
+    st.plotly_chart(fig_diptheria)
+with col1:
+    st.plotly_chart(fig_polio)
+with col2:
+    st.plotly_chart(fig_hepatitisB)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+pandas
+matplotlib
+plotly
+seaborn
+scipy
+streamlit