7sugiwa committed on
Commit
a9d560d
1 Parent(s): cce1668

Add application file

Browse files
Files changed (3) hide show
  1. app.py +47 -0
  2. eda.py +108 -0
  3. prediction.py +32 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import prediction
6
+ import eda # Import the eda module
7
+
8
+ # Set up the main structure of the app
9
def main():
    """Render the app shell and show the page selected in the sidebar.

    Three pages are available: a static welcome page, the exploratory data
    analysis page (delegated to ``eda.main``) and a prediction form whose
    values are passed to ``prediction.predict_rainfall``.
    """
    st.title("Rainfall Prediction in Australia")
    st.write("This app predicts whether it will rain tomorrow in Australia based on weather data.")

    # Sidebar navigation
    st.sidebar.title("Navigation")
    pages = ["Home", "Exploratory Data Analysis", "Make a Prediction"]
    app_mode = st.sidebar.selectbox("Choose the app mode", pages)

    if app_mode == "Home":
        st.write("Welcome to the Weather Forecasting Application!")
        st.write("Navigate to different sections using the sidebar.")
        return

    if app_mode == "Exploratory Data Analysis":
        st.subheader("Exploratory Data Analysis")
        # The EDA page renders itself entirely from eda.py.
        eda.main()
        return

    if app_mode == "Make a Prediction":
        st.subheader("Make a Prediction")
        # The form is drawn on every rerun; the model is only invoked once
        # the user presses the button.
        user_input = get_user_input()
        if st.button("Predict"):
            result = prediction.predict_rainfall(*user_input)
            outcome = 'It will rain tomorrow.' if result else 'No rain tomorrow.'
            st.write(f"Prediction: {outcome}")
33
+
34
def get_user_input():
    """Draw the prediction form and return the current widget values.

    Returns:
        A 9-tuple in the positional order expected by
        ``prediction.predict_rainfall``: humidity at 3 PM, rainfall,
        rain-today answer ('Yes'/'No'), temperature range, wind gust speed,
        pressure at 9 AM, average pressure, humidity change, average humidity.
    """
    number = st.number_input

    # Widgets are created top to bottom in the order they appear on the page.
    h3pm = number('Humidity at 3 PM', min_value=0, max_value=100, value=50)
    rain_mm = number('Rainfall (mm)', min_value=0.0, max_value=1000.0, value=0.0)
    rained = st.selectbox('Did it rain today?', options=['Yes', 'No'])
    t_range = number('Temperature Range (°C)', min_value=0.0, max_value=50.0, value=10.0)
    gust = number('Wind Gust Speed (km/h)', min_value=0, max_value=100, value=20)
    p9am = number('Pressure at 9 AM (hPa)', min_value=980, max_value=1040, value=1010)
    p_avg = number('Average Daily Pressure (hPa)', min_value=980, max_value=1040, value=1010)
    h_delta = number('Change in Humidity', min_value=-100, max_value=100, value=0)
    h_avg = number('Average Daily Humidity', min_value=0, max_value=100, value=50)

    return (h3pm, rain_mm, rained, t_range, gust, p9am, p_avg, h_delta, h_avg)
45
+
46
# Script entry point: build the app only when this file is run directly.
if __name__ == '__main__':
    main()
eda.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # eda.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import phik # Ensure this library is installed
7
+
8
@st.cache_data
def load_and_preprocess_data():
    """Read ``weatherAUS.csv`` from the working directory into a DataFrame.

    No preprocessing is applied here despite the name; the frame is returned
    exactly as parsed by ``pandas.read_csv``. The function is wrapped in
    ``st.cache_data``.
    """
    return pd.read_csv("weatherAUS.csv")
12
+
13
def plot_data_distributions(df):
    """Render one distribution chart per column of ``df`` on the page.

    Columns with more than 10 distinct values (NaN counts as a value) are
    drawn as a histogram with a KDE overlay; all other columns are drawn as
    a count plot of their classes.

    Args:
        df: DataFrame whose columns are plotted, one figure per column.
    """
    sns.set_style('whitegrid')
    for column in df.columns:
        # Use an explicit figure/axes pair instead of pyplot's implicit
        # "current figure" so that exactly this figure is handed to Streamlit.
        fig, ax = plt.subplots(figsize=(8, 4))
        if len(df[column].unique()) > 10:
            sns.histplot(df[column], kde=True, color='skyblue', ax=ax)
            ax.set_title(f'Distribution of {column}')
        else:
            sns.countplot(x=column, data=df, palette='Set2', ax=ax)
            ax.set_title(f'Count of different classes in {column}')
        st.pyplot(fig)
        # Release the figure; otherwise one figure per column stays
        # registered with pyplot on every rerun of the script.
        plt.close(fig)
24
+
25
def calculate_phi_k_correlation(df):
    """Compute the Phi_k correlation matrix of ``df`` and show it as a heatmap.

    Args:
        df: DataFrame to correlate. ``phik_matrix`` is the DataFrame method
            made available by the ``phik`` import at the top of the module.
    """
    phi_k_correlation = df.phik_matrix()

    # Draw on an explicit figure so the heatmap does not depend on pyplot's
    # global "current figure" state.
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(phi_k_correlation, annot=True, fmt=".2f", linewidths=.5,
                cmap='coolwarm', ax=ax)
    ax.set_title('Phi_k Correlation Matrix Heatmap')
    st.pyplot(fig)
    # Close the figure once rendered so reruns do not accumulate figures.
    plt.close(fig)
31
+
32
def perform_temporal_feature_extraction(df):
    """Parse the ``Date`` column and derive ``Year`` and ``Month`` from it.

    The frame is modified in place: ``Date`` is replaced by its datetime
    form and two integer columns are added.

    Args:
        df: DataFrame with a ``Date`` column parseable by ``pd.to_datetime``.

    Returns:
        The same DataFrame object that was passed in.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    df['Year'] = parsed_dates.dt.year
    df['Month'] = parsed_dates.dt.month
    return df
37
+
38
def plot_annual_and_monthly_trends(df):
    """Plot yearly and monthly averages of the main weather metrics.

    Produces a 2x4 grid: the top row shows means grouped by ``Year``, the
    bottom row means grouped by ``Month``; the columns are temperature,
    rainfall, humidity and pressure.

    Args:
        df: DataFrame containing ``Year`` and ``Month`` columns plus the
            metric columns listed in ``metrics`` below.
    """
    metrics = ['MinTemp', 'MaxTemp', 'Rainfall', 'Humidity9am',
               'Humidity3pm', 'Pressure9am', 'Pressure3pm']
    # (columns to draw, title suffix) for each grid column. 'Rainfall' is a
    # plain string on purpose so that a Series, not a DataFrame, is plotted.
    panels = [
        (['MinTemp', 'MaxTemp'], 'Temperature'),
        ('Rainfall', 'Rainfall'),
        (['Humidity9am', 'Humidity3pm'], 'Humidity'),
        (['Pressure9am', 'Pressure3pm'], 'Pressure'),
    ]
    trends = [
        ('Annual', df.groupby('Year')[metrics].mean()),
        ('Monthly', df.groupby('Month')[metrics].mean()),
    ]

    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
    for row, (period, table) in enumerate(trends):
        for col, (columns, label) in enumerate(panels):
            table[columns].plot(ax=axes[row, col], title=f'{period} Avg {label}')

    # Lay out this specific figure rather than whatever pyplot considers
    # current.
    fig.tight_layout()
    st.pyplot(fig)
    # Free the figure after rendering so reruns do not accumulate figures.
    plt.close(fig)
54
+
55
def perform_missing_value_analysis(df):
    """Show a per-column table of missing-value counts and percentages.

    The table is sorted by percentage missing, highest first.

    Args:
        df: DataFrame to inspect.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Total Missing': null_counts,
        'Percentage Missing': (null_counts / len(df)) * 100,
    })
    ranked = summary.sort_values(by='Percentage Missing', ascending=False)
    st.write(ranked)
61
+
62
def perform_outlier_detection(df, key_columns):
    """Count IQR-rule outliers per column and display the summary.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of its column.

    Args:
        df: DataFrame holding the data.
        key_columns: Iterable of numeric column names to analyse.
    """
    total_rows = df.shape[0]
    outlier_analysis = {}
    for col in key_columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Count directly from the boolean mask; no need to materialise the
        # filtered frame just to read its length.
        outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
        outlier_analysis[col] = {
            "Outliers": outlier_count,
            # Guard against an empty frame, which would otherwise raise
            # ZeroDivisionError.
            "Percentage": (outlier_count / total_rows) * 100 if total_rows else 0.0,
        }
    st.write(outlier_analysis)
76
+
77
def perform_categorical_data_analysis(df):
    """Display the value counts of every object-dtype column in ``df``.

    Args:
        df: DataFrame to inspect.
    """
    counts_by_column = {}
    for name in df.select_dtypes(include=['object']).columns:
        counts_by_column[name] = df[name].value_counts()
    st.write(counts_by_column)
81
+
82
+ # Main Function
83
def main():
    """Render the EDA page: load the data, then show each enabled section.

    Every analysis is toggled by its own sidebar checkbox and rendered in
    the order the checkboxes are listed.
    """
    st.title("Exploratory Data Analysis - Weather Forecasting")

    # 'Year' and 'Month' must exist before the trend plots are drawn, so the
    # temporal features are derived immediately after loading.
    df = perform_temporal_feature_extraction(load_and_preprocess_data())

    outlier_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm']

    # (sidebar label, renderer) pairs, evaluated strictly in this order.
    sections = (
        ("Show Data Distributions", plot_data_distributions),
        ("Show Correlation Heatmap", calculate_phi_k_correlation),
        ("Show Annual and Monthly Trends", plot_annual_and_monthly_trends),
        ("Show Missing Value Analysis", perform_missing_value_analysis),
        ("Show Outlier Analysis",
         lambda frame: perform_outlier_detection(frame, outlier_columns)),
        ("Show Categorical Data Analysis", perform_categorical_data_analysis),
    )
    for label, render in sections:
        if st.sidebar.checkbox(label):
            render(df)
106
+
107
# Allow the EDA page to be launched on its own as a script.
if __name__ == "__main__":
    main()
prediction.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # prediction.py
2
+ import pickle
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ # Load the saved model, pipeline, and label encoder
7
# Module-level slots for the unpickled artifacts; filled by load_artifacts().
model, pipeline, le = None, None, None


def load_artifacts():
    """Unpickle the model, preprocessing pipeline and label encoder.

    Reads ``xgboost_optimized_model.pkl``, ``pipeline.pkl`` and
    ``lerain.pkl`` from the working directory, in that order, and stores the
    results in the module globals ``model``, ``pipeline`` and ``le``.

    NOTE: ``pickle.load`` can execute arbitrary code; these files must come
    from a trusted source.
    """
    global model, pipeline, le

    def _unpickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    model = _unpickle('xgboost_optimized_model.pkl')
    pipeline = _unpickle('pipeline.pkl')
    le = _unpickle('lerain.pkl')


# Artifacts are loaded once, when the module is first imported.
load_artifacts()
19
+
20
+
21
+
22
def predict_rainfall(humidity_3pm, rainfall, rain_today, temp_range, wind_gust_speed, pressure_9am, avg_pressure, humidity_change, avg_humidity):
    """Predict tomorrow's rain from a single day's weather features.

    Args:
        humidity_3pm: Humidity at 3 PM.
        rainfall: Rainfall amount; fed to the model as ``log(rainfall + 1)``.
        rain_today: Label accepted by the loaded encoder ``le`` (the UI in
            app.py offers 'Yes' / 'No').
        temp_range: Temperature range.
        wind_gust_speed: Wind gust speed.
        pressure_9am: Pressure at 9 AM.
        avg_pressure: Average daily pressure.
        humidity_change: Change in humidity.
        avg_humidity: Average daily humidity.

    Returns:
        The first element of ``model.predict`` for the single-row input.
    """
    # Column name -> value, in the column order the DataFrame is built with.
    features = {
        'Humidity3pm': humidity_3pm,
        'Rainfall_log': np.log(rainfall + 1),
        'RainToday': le.transform([rain_today])[0],
        'TempRange': temp_range,
        'WindGustSpeed': wind_gust_speed,
        'Pressure9am': pressure_9am,
        'AvgPressure': avg_pressure,
        'HumidityChange': humidity_change,
        'AvgHumidity': avg_humidity,
    }
    row = pd.DataFrame([list(features.values())], columns=list(features))

    # Run the saved preprocessing pipeline, then the model, on the one row.
    predicted = model.predict(pipeline.transform(row))
    return predicted[0]