# eda.py
# Streamlit app for exploratory data analysis of the weatherAUS.csv dataset.
import matplotlib.pyplot as plt
import pandas as pd
import phik  # noqa: F401  # Ensure this library is installed
import seaborn as sns
import streamlit as st
def load_and_preprocess_data():
    """Read ``weatherAUS.csv`` from the current working directory.

    Despite the name, no preprocessing is applied here: the parsed CSV is
    returned exactly as ``pandas.read_csv`` produces it.

    Returns:
        pandas.DataFrame: the contents of ``weatherAUS.csv``.
    """
    return pd.read_csv("weatherAUS.csv")
def plot_data_distributions(df):
    """Render one distribution chart per column of ``df`` in the Streamlit page.

    A column with more than 10 distinct values (as reported by
    ``Series.unique()``) is drawn as a histogram with a KDE overlay; any
    other column is drawn as a count plot of its classes.

    Args:
        df: pandas DataFrame; every column is plotted, in column order.
    """
    sns.set_style('whitegrid')
    for column in df.columns:
        # Draw on an explicit Figure/Axes rather than pyplot's implicit
        # "current figure", and hand that Figure to Streamlit.
        fig, ax = plt.subplots(figsize=(8, 4))
        try:
            if len(df[column].unique()) > 10:
                sns.histplot(df[column], kde=True, color='skyblue', ax=ax)
                ax.set_title(f'Distribution of {column}')
            else:
                sns.countplot(x=column, data=df, palette='Set2', ax=ax)
                ax.set_title(f'Count of different classes in {column}')
            st.pyplot(fig)
        finally:
            # One figure is created per column; without closing them they
            # stay registered with pyplot and accumulate.
            plt.close(fig)
def calculate_phi_k_correlation(df):
    """Compute the Phi_k correlation matrix of ``df`` and show it as a heatmap.

    Relies on ``DataFrame.phik_matrix``, which is not a pandas built-in; it is
    expected to be made available by the ``phik`` import at the top of this
    file.

    Args:
        df: pandas DataFrame whose columns are correlated against each other.
    """
    phi_k_correlation = df.phik_matrix()

    # Use an explicit Figure/Axes so Streamlit receives a concrete figure
    # instead of the pyplot module and its implicit global state.
    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        sns.heatmap(
            phi_k_correlation,
            annot=True,
            fmt=".2f",
            linewidths=.5,
            cmap='coolwarm',
            ax=ax,
        )
        ax.set_title('Phi_k Correlation Matrix Heatmap')
        st.pyplot(fig)
    finally:
        # Release the figure so it does not stay registered with pyplot.
        plt.close(fig)
def perform_temporal_feature_extraction(df):
    """Parse the ``Date`` column and derive ``Year`` and ``Month`` from it.

    The frame is modified in place: ``Date`` is replaced by its parsed
    datetime values and two new columns, ``Year`` and ``Month``, are added.

    Args:
        df: pandas DataFrame containing a ``Date`` column that
            ``pandas.to_datetime`` can parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    # Assign one at a time so the new columns are appended as Year, then Month.
    for feature in ('Year', 'Month'):
        df[feature] = getattr(parsed_dates.dt, feature.lower())
    return df
def plot_annual_and_monthly_trends(df):
    """Plot yearly and monthly mean weather measurements on a 2x4 grid.

    The top row shows means grouped by ``Year`` and the bottom row means
    grouped by ``Month``; the columns are temperature, rainfall, humidity and
    pressure.

    Args:
        df: pandas DataFrame with ``Year`` and ``Month`` columns plus
            ``MinTemp``, ``MaxTemp``, ``Rainfall``, ``Humidity9am``,
            ``Humidity3pm``, ``Pressure9am`` and ``Pressure3pm``.
    """
    trend_columns = [
        'MinTemp', 'MaxTemp', 'Rainfall',
        'Humidity9am', 'Humidity3pm',
        'Pressure9am', 'Pressure3pm',
    ]
    # (grouping column, title prefix) for each row of the grid.
    periods = (('Year', 'Annual'), ('Month', 'Monthly'))
    # (column selection, title suffix) for each column of the grid.
    # 'Rainfall' is a bare string so it is plotted as a Series, as before.
    panels = (
        (['MinTemp', 'MaxTemp'], 'Temperature'),
        ('Rainfall', 'Rainfall'),
        (['Humidity9am', 'Humidity3pm'], 'Humidity'),
        (['Pressure9am', 'Pressure3pm'], 'Pressure'),
    )

    # Aggregate before creating the figure so a missing column cannot leave
    # an orphaned figure behind.
    trends_by_period = [
        (prefix, df.groupby(group_key)[trend_columns].mean())
        for group_key, prefix in periods
    ]

    fig, axes = plt.subplots(nrows=len(periods), ncols=len(panels), figsize=(20, 10))
    try:
        for row, (prefix, trends) in enumerate(trends_by_period):
            for col, (selection, measure) in enumerate(panels):
                trends[selection].plot(ax=axes[row, col], title=f'{prefix} Avg {measure}')
        # Lay out this specific figure rather than pyplot's current figure.
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # Release the figure so it does not stay registered with pyplot.
        plt.close(fig)
def perform_missing_value_analysis(df):
    """Show a per-column table of missing-value counts and percentages.

    The table has one row per column of ``df`` and is sorted so that the
    columns with the highest share of missing values come first.

    Args:
        df: pandas DataFrame to inspect.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100
    report = pd.DataFrame({
        'Total Missing': null_counts,
        'Percentage Missing': null_share,
    })
    ranked_report = report.sort_values(by='Percentage Missing', ascending=False)
    st.write(ranked_report)
def perform_outlier_detection(df, key_columns):
    """Count IQR-rule outliers in each of ``key_columns`` and display them.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its column. The result, a dict mapping each column
    name to ``{"Outliers": count, "Percentage": percent}``, is written to the
    Streamlit page.

    Args:
        df: pandas DataFrame holding the columns to examine.
        key_columns: iterable of column names in ``df`` to analyse.
    """
    # Percentages are relative to every row of the frame, including rows
    # where the column itself is missing.
    total_rows = df.shape[0]
    outlier_analysis = {}
    for col in key_columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Count the boolean mask directly instead of materialising a filtered
        # copy of the whole frame; int() keeps the count a plain Python int.
        outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
        outlier_analysis[col] = {
            "Outliers": outlier_count,
            # A frame with no rows has no outliers; avoid dividing by zero.
            "Percentage": (outlier_count / total_rows) * 100 if total_rows else 0.0,
        }
    st.write(outlier_analysis)
def perform_categorical_data_analysis(df):
    """Display the value counts of every ``object``-dtype column in ``df``.

    The output written to the Streamlit page is a dict mapping each such
    column name to the result of ``Series.value_counts()`` for that column.

    Args:
        df: pandas DataFrame to inspect.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    counts_by_column = {}
    for name in object_columns:
        counts_by_column[name] = df[name].value_counts()
    st.write(counts_by_column)
# Main Function
def main():
    """Build the Streamlit page: load the data, then render selected analyses.

    Each analysis has its own sidebar checkbox and is rendered only while its
    checkbox is ticked. Checkboxes and analyses appear in the order listed
    below.
    """
    st.title("Exploratory Data Analysis - Weather Forecasting")

    # Load the data and derive 'Year'/'Month' up front, because the trend
    # plots group by those columns.
    df = perform_temporal_feature_extraction(load_and_preprocess_data())

    key_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm']

    # Ordered (sidebar label, renderer) pairs; every renderer takes the frame.
    sections = (
        ("Show Data Distributions", plot_data_distributions),
        ("Show Correlation Heatmap", calculate_phi_k_correlation),
        ("Show Annual and Monthly Trends", plot_annual_and_monthly_trends),
        ("Show Missing Value Analysis", perform_missing_value_analysis),
        ("Show Outlier Analysis", lambda frame: perform_outlier_detection(frame, key_columns)),
        ("Show Categorical Data Analysis", perform_categorical_data_analysis),
    )
    for label, render in sections:
        if st.sidebar.checkbox(label):
            render(df)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()