Spaces:
Sleeping
Sleeping
7sugiwa
committed on
Commit
•
a9d560d
1
Parent(s):
cce1668
Add application file
Browse files- app.py +47 -0
- eda.py +108 -0
- prediction.py +32 -0
app.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app.py
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import prediction
|
6 |
+
import eda # Import the eda module
|
7 |
+
|
8 |
+
# Set up the main structure of the app
|
9 |
+
def main():
    """Render the landing page and dispatch to the mode chosen in the sidebar.

    Three modes are offered through a sidebar select box:

    * "Home" shows two short welcome messages.
    * "Exploratory Data Analysis" delegates to ``eda.main()``.
    * "Make a Prediction" collects inputs with ``get_user_input()`` and,
      once the "Predict" button is pressed, passes them positionally to
      ``prediction.predict_rainfall`` and prints the outcome.
    """
    st.title("Rainfall Prediction in Australia")
    st.write("This app predicts whether it will rain tomorrow in Australia based on weather data.")

    # Sidebar navigation
    st.sidebar.title("Navigation")
    app_mode = st.sidebar.selectbox(
        "Choose the app mode",
        ["Home", "Exploratory Data Analysis", "Make a Prediction"],
    )

    if app_mode == "Home":
        st.write("Welcome to the Weather Forecasting Application!")
        st.write("Navigate to different sections using the sidebar.")
    elif app_mode == "Exploratory Data Analysis":
        st.subheader("Exploratory Data Analysis")
        # The EDA page lives in eda.py; its main() draws its own widgets.
        eda.main()
    elif app_mode == "Make a Prediction":
        st.subheader("Make a Prediction")
        # The input widgets are drawn on every rerun so they stay visible;
        # the model is only invoked when the button is pressed.
        user_input = get_user_input()
        if st.button("Predict"):
            # user_input is a tuple whose order matches the positional
            # parameters of prediction.predict_rainfall.
            result = prediction.predict_rainfall(*user_input)
            st.write(f"Prediction: {'It will rain tomorrow.' if result else 'No rain tomorrow.'}")
|
33 |
+
|
34 |
+
def get_user_input():
    """Draw the prediction input widgets and return their current values.

    Returns:
        A 9-tuple in this exact order: ``(humidity_3pm, rainfall,
        rain_today, temp_range, wind_gust_speed, pressure_9am,
        avg_pressure, humidity_change, avg_humidity)``. ``rain_today`` is
        the string ``'Yes'`` or ``'No'``; the other items are the numbers
        entered in the corresponding ``st.number_input`` widgets. The
        caller unpacks the tuple positionally, so the order must not change.
    """
    humidity_3pm = st.number_input('Humidity at 3 PM', min_value=0, max_value=100, value=50)
    rainfall = st.number_input('Rainfall (mm)', min_value=0.0, max_value=1000.0, value=0.0)
    rain_today = st.selectbox('Did it rain today?', options=['Yes', 'No'])
    temp_range = st.number_input('Temperature Range (°C)', min_value=0.0, max_value=50.0, value=10.0)
    wind_gust_speed = st.number_input('Wind Gust Speed (km/h)', min_value=0, max_value=100, value=20)
    pressure_9am = st.number_input('Pressure at 9 AM (hPa)', min_value=980, max_value=1040, value=1010)
    avg_pressure = st.number_input('Average Daily Pressure (hPa)', min_value=980, max_value=1040, value=1010)
    humidity_change = st.number_input('Change in Humidity', min_value=-100, max_value=100, value=0)
    avg_humidity = st.number_input('Average Daily Humidity', min_value=0, max_value=100, value=50)
    return (
        humidity_3pm,
        rainfall,
        rain_today,
        temp_range,
        wind_gust_speed,
        pressure_9am,
        avg_pressure,
        humidity_change,
        avg_humidity,
    )
|
45 |
+
|
46 |
+
# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()
|
eda.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# eda.py
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import phik # Ensure this library is installed
|
7 |
+
|
8 |
+
@st.cache_data
def load_and_preprocess_data():
    """Read ``weatherAUS.csv`` from the working directory into a DataFrame.

    The result is memoised with ``st.cache_data`` so the CSV is not re-read
    on every Streamlit rerun. Despite the name, no preprocessing happens
    here; the frame is returned exactly as ``pd.read_csv`` produced it.
    """
    return pd.read_csv("weatherAUS.csv")
|
12 |
+
|
13 |
+
def plot_data_distributions(df):
    """Render one distribution chart per column of ``df`` in the Streamlit page.

    Columns with more than 10 distinct values (NaN counted as a value) get a
    histogram with a KDE overlay; all other columns get a count plot.

    Each chart is drawn on its own explicit Figure, which is handed to
    ``st.pyplot`` and then closed. Passing the ``pyplot`` module instead of
    a Figure and never closing it leaves one open figure per column per
    rerun.

    Args:
        df: DataFrame whose columns are plotted, one figure per column.
    """
    sns.set_style('whitegrid')
    for column in df.columns:
        fig, ax = plt.subplots(figsize=(8, 4))
        # dropna=False mirrors len(Series.unique()), which also counts NaN.
        if df[column].nunique(dropna=False) > 10:
            sns.histplot(df[column], kde=True, color='skyblue', ax=ax)
            ax.set_title(f'Distribution of {column}')
        else:
            sns.countplot(x=column, data=df, palette='Set2', ax=ax)
            ax.set_title(f'Count of different classes in {column}')
        st.pyplot(fig)
        # Release the figure so repeated reruns do not accumulate them.
        plt.close(fig)
|
24 |
+
|
25 |
+
def calculate_phi_k_correlation(df):
    """Compute the phi_k correlation matrix of ``df`` and show it as a heatmap.

    ``df.phik_matrix()`` is the DataFrame method made available by the
    file-level ``import phik``. The heatmap is drawn on an explicit Figure
    that is closed after rendering, so it does not linger between reruns.

    Args:
        df: DataFrame whose columns are correlated against each other.
    """
    phi_k_correlation = df.phik_matrix()
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        phi_k_correlation,
        annot=True,
        fmt=".2f",
        linewidths=.5,
        cmap='coolwarm',
        ax=ax,
    )
    ax.set_title('Phi_k Correlation Matrix Heatmap')
    st.pyplot(fig)
    plt.close(fig)
|
31 |
+
|
32 |
+
def perform_temporal_feature_extraction(df):
    """Parse the ``Date`` column and derive ``Year`` and ``Month`` columns.

    The frame is modified in place and the same object is returned, so both
    ``df = f(df)`` and a bare ``f(df)`` call leave the caller with the new
    columns.

    Args:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame, with ``Date`` converted to datetime and integer
        ``Year`` / ``Month`` columns added.
    """
    # Parse once and reuse the parsed series for both derived columns.
    dates = pd.to_datetime(df['Date'])
    df['Date'] = dates
    df['Year'] = dates.dt.year
    df['Month'] = dates.dt.month
    return df
|
37 |
+
|
38 |
+
def plot_annual_and_monthly_trends(df):
    """Plot yearly and monthly means of the key weather measurements.

    Draws a 2x4 grid: the top row shows per-``Year`` averages and the bottom
    row per-``Month`` averages, each for temperature, rainfall, humidity and
    pressure. The figure is closed after it has been rendered.

    Args:
        df: DataFrame that already has ``Year`` and ``Month`` columns plus
            the measurement columns listed in ``trend_columns`` below.
    """
    trend_columns = [
        'MinTemp', 'MaxTemp', 'Rainfall',
        'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    ]
    annual_trends = df.groupby('Year')[trend_columns].mean()
    monthly_trends = df.groupby('Month')[trend_columns].mean()

    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
    annual_trends[['MinTemp', 'MaxTemp']].plot(ax=axes[0, 0], title='Annual Avg Temperature')
    annual_trends['Rainfall'].plot(ax=axes[0, 1], title='Annual Avg Rainfall')
    annual_trends[['Humidity9am', 'Humidity3pm']].plot(ax=axes[0, 2], title='Annual Avg Humidity')
    annual_trends[['Pressure9am', 'Pressure3pm']].plot(ax=axes[0, 3], title='Annual Avg Pressure')
    monthly_trends[['MinTemp', 'MaxTemp']].plot(ax=axes[1, 0], title='Monthly Avg Temperature')
    monthly_trends['Rainfall'].plot(ax=axes[1, 1], title='Monthly Avg Rainfall')
    monthly_trends[['Humidity9am', 'Humidity3pm']].plot(ax=axes[1, 2], title='Monthly Avg Humidity')
    monthly_trends[['Pressure9am', 'Pressure3pm']].plot(ax=axes[1, 3], title='Monthly Avg Pressure')

    # Lay out this figure specifically rather than whichever one pyplot
    # currently treats as "current".
    fig.tight_layout()
    st.pyplot(fig)
    plt.close(fig)
|
54 |
+
|
55 |
+
def perform_missing_value_analysis(df):
    """Show a per-column table of missing-value counts and percentages.

    The table has columns ``Total Missing`` and ``Percentage Missing`` and
    is written to the page sorted by percentage, highest first.

    Args:
        df: DataFrame to inspect for null values.
    """
    # Count nulls once and derive the percentage from the same series.
    missing_values_total = df.isnull().sum()
    missing_values_percentage = (missing_values_total / len(df)) * 100
    missing_values_analysis = pd.DataFrame({
        'Total Missing': missing_values_total,
        'Percentage Missing': missing_values_percentage,
    })

    st.write(missing_values_analysis.sort_values(by='Percentage Missing', ascending=False))
|
61 |
+
|
62 |
+
def perform_outlier_detection(df, key_columns):
    """Report how many rows fall outside the 1.5*IQR fences for each column.

    For every column in ``key_columns`` the lower fence is ``Q1 - 1.5*IQR``
    and the upper fence is ``Q3 + 1.5*IQR``. Rows strictly below the lower
    or strictly above the upper fence count as outliers. The result is a
    dict ``{column: {"Outliers": count, "Percentage": pct}}`` that is
    written to the page once all columns have been processed.

    Args:
        df: DataFrame holding the columns to analyse.
        key_columns: Iterable of numeric column names in ``df``.
    """
    total_rows = df.shape[0]
    outlier_analysis = {}
    for col in key_columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Count on the boolean mask instead of building a filtered frame;
        # int() keeps the value a plain Python int, as shape[0] was.
        outlier_count = int(((df[col] < lower_bound) | (df[col] > upper_bound)).sum())
        outlier_analysis[col] = {
            "Outliers": outlier_count,
            # An empty frame would otherwise raise ZeroDivisionError.
            "Percentage": (outlier_count / total_rows) * 100 if total_rows else 0.0,
        }
    st.write(outlier_analysis)
|
76 |
+
|
77 |
+
def perform_categorical_data_analysis(df):
    """Write the value counts of every object-dtype column to the page.

    Builds a dict mapping each object-dtype column name to the result of
    ``value_counts()`` on that column and passes the dict to ``st.write``.

    Args:
        df: DataFrame whose object-dtype columns are summarised.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns
    categorical_analysis = {
        col: df[col].value_counts() for col in categorical_columns
    }
    st.write(categorical_analysis)
|
81 |
+
|
82 |
+
# Main Function
|
83 |
+
def main():
    """Render the EDA page: load the data and show the analyses ticked in the sidebar.

    The dataset is loaded and the ``Year`` / ``Month`` columns are derived
    up front because the trend plots group by them. Each sidebar checkbox
    then toggles one independent analysis section.
    """
    st.title("Exploratory Data Analysis - Weather Forecasting")

    # Load and preprocess data
    df = load_and_preprocess_data()

    # Temporal features must exist before any section that uses 'Year'.
    df = perform_temporal_feature_extraction(df)

    # Checkboxes and plotting functions
    if st.sidebar.checkbox("Show Data Distributions"):
        plot_data_distributions(df)
    if st.sidebar.checkbox("Show Correlation Heatmap"):
        calculate_phi_k_correlation(df)
    if st.sidebar.checkbox("Show Annual and Monthly Trends"):
        plot_annual_and_monthly_trends(df)
    if st.sidebar.checkbox("Show Missing Value Analysis"):
        perform_missing_value_analysis(df)
    if st.sidebar.checkbox("Show Outlier Analysis"):
        key_columns = [
            'MinTemp', 'MaxTemp', 'Rainfall',
            'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
        ]
        perform_outlier_detection(df, key_columns)
    if st.sidebar.checkbox("Show Categorical Data Analysis"):
        perform_categorical_data_analysis(df)
|
106 |
+
|
107 |
+
# Allow the EDA page to be run on its own as a script.
if __name__ == '__main__':
    main()
|
prediction.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prediction.py
|
2 |
+
import pickle
|
3 |
+
import numpy as np
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
# Load the saved model, pipeline, and label encoder
|
7 |
+
# Populated by load_artifacts() at import time; predict_rainfall reads them.
model, pipeline, le = None, None, None


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code from the file.
    # Only ever point this at artifact files you produced yourself.
    with open(path, 'rb') as file:
        return pickle.load(file)


def load_artifacts():
    """Load the model, preprocessing pipeline and label encoder into module globals.

    Reads ``xgboost_optimized_model.pkl``, ``pipeline.pkl`` and
    ``lerain.pkl`` (in that order) from the current working directory and
    rebinds the module-level ``model``, ``pipeline`` and ``le``.

    Raises:
        FileNotFoundError: if any of the three files is missing; globals
            assigned before the failure keep their newly loaded value.
    """
    global model, pipeline, le
    model = _load_pickle('xgboost_optimized_model.pkl')
    pipeline = _load_pickle('pipeline.pkl')
    le = _load_pickle('lerain.pkl')


# Load eagerly so the first prediction does not pay the deserialisation cost.
load_artifacts()
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
def predict_rainfall(humidity_3pm, rainfall, rain_today, temp_range, wind_gust_speed, pressure_9am, avg_pressure, humidity_change, avg_humidity):
    """Predict tomorrow's rain from a single set of weather readings.

    The inputs are assembled into a one-row DataFrame with the columns
    ``Humidity3pm, Rainfall_log, RainToday, TempRange, WindGustSpeed,
    Pressure9am, AvgPressure, HumidityChange, AvgHumidity``. Two inputs are
    transformed first: ``rainfall`` becomes ``log(rainfall + 1)`` and
    ``rain_today`` is encoded with the module-level label encoder ``le``.
    The row is then passed through the module-level ``pipeline`` and
    ``model``.

    Args:
        humidity_3pm: Humidity reading at 3 PM.
        rainfall: Rainfall amount; must be greater than -1 for the log.
        rain_today: Label understood by ``le`` (the app passes 'Yes'/'No').
        temp_range: Temperature range for the day.
        wind_gust_speed: Wind gust speed.
        pressure_9am: Pressure reading at 9 AM.
        avg_pressure: Average daily pressure.
        humidity_change: Change in humidity over the day.
        avg_humidity: Average daily humidity.

    Returns:
        The first element of ``model.predict(...)`` for the single row; the
        app treats it as truthy for "rain" and falsy for "no rain".
    """
    # Column order is kept exactly as given here because the fitted
    # pipeline receives this frame directly.
    feature_names = [
        'Humidity3pm', 'Rainfall_log', 'RainToday', 'TempRange', 'WindGustSpeed',
        'Pressure9am', 'AvgPressure', 'HumidityChange', 'AvgHumidity',
    ]
    feature_row = [
        humidity_3pm,
        np.log(rainfall + 1),
        le.transform([rain_today])[0],
        temp_range,
        wind_gust_speed,
        pressure_9am,
        avg_pressure,
        humidity_change,
        avg_humidity,
    ]
    data = pd.DataFrame([feature_row], columns=feature_names)

    # Apply transformations and make prediction
    transformed_data = pipeline.transform(data)
    prediction = model.predict(transformed_data)
    return prediction[0]
|