# Winemakers-Diemma / data_preparation_3.py
# alaahussein's picture
# Upload folder using huggingface_hub
# a17642c verified
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
import mlflow
import dagshub
# Initialize DagsHub with its MLflow integration enabled, then set the MLflow
# tracking URI explicitly to the repository's MLflow endpoint.
dagshub.init(
    repo_owner='alaa.hussein401',
    repo_name='Winemakers-Dilemma',
    mlflow=True,
)
tracking_uri = 'https://dagshub.com/alaa.hussein401/Winemakers-Dilemma.mlflow'
mlflow.set_tracking_uri(tracking_uri)
# Start an MLflow run; loading, feature engineering, training and logging all
# happen inside it so every logged value is attached to this run.
with mlflow.start_run():
    # Weeks whose precipitation is being classified, and how many earlier
    # weeks the lag / moving-average features look back.
    first_week, last_week, lookback = 35, 40, 3

    # Load the dataset
    df = pd.read_csv('vineyard_weather_1948-2017.csv')

    # Preprocess: parse dates and derive calendar year and ISO week number
    df['DATE'] = pd.to_datetime(df['DATE'], format='%Y-%m-%d')
    df['year'] = df['DATE'].dt.year
    df['week_of_year'] = df['DATE'].dt.isocalendar().week

    # Keep the target weeks plus the `lookback` weeks immediately before
    # them. The extra weeks are only needed so that week 35's lag features
    # come from weeks 32-34 of the same year rather than from the previous
    # year's week 40.
    in_window = (
        (df['week_of_year'] >= first_week - lookback)
        & (df['week_of_year'] <= last_week)
    )
    df_filtered = df[in_window]

    # Aggregate data by year and week number (result is sorted by both keys)
    weekly_data = df_filtered.groupby(['year', 'week_of_year']).agg({
        'PRCP': 'sum',
        'TMAX': 'mean',
        'TMIN': 'mean',
    }).reset_index()

    # Feature engineering. All precipitation features are computed within a
    # single year and use strictly earlier weeks only: the label below is
    # derived from the current week's PRCP, so a rolling mean that included
    # the current week would leak the target into the features.
    prcp_by_year = weekly_data.groupby('year')['PRCP']
    weekly_data['PRCP_lag1'] = prcp_by_year.shift(1)
    weekly_data['PRCP_lag2'] = prcp_by_year.shift(2)
    weekly_data['PRCP_MA3'] = prcp_by_year.transform(
        lambda prcp: prcp.shift(1).rolling(window=lookback).mean()
    )

    # Restrict to the target weeks and drop any rows that still contain NaN
    # (e.g. a year whose look-back weeks are missing from the data).
    weekly_data = weekly_data[weekly_data['week_of_year'] >= first_week].dropna()

    # Select features for training; the label is 1 when the week's total
    # precipitation is at least 0.35.
    features = ['TMAX', 'TMIN', 'PRCP_lag1', 'PRCP_lag2', 'PRCP_MA3']
    X = weekly_data[features]
    y = (weekly_data['PRCP'] >= 0.35).astype(int)

    # Split into train and test sets. Stratify so the held-out set keeps the
    # same class ratio as the full data (the classes are imbalanced, which
    # is why SMOTE is used below).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Pipeline: standardize, oversample the minority class with SMOTE, then
    # fit the random forest. The imblearn pipeline applies SMOTE during fit
    # only, so test data is never resampled.
    pipeline = imbpipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(n_estimators=200, random_state=42)),
    ])

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Log parameters, metrics, and artifacts
    mlflow.log_params(pipeline.named_steps['classifier'].get_params())
    mlflow.log_metric('accuracy', pipeline.score(X_test, y_test))

    # Generate and log classification report
    report = classification_report(y_test, y_pred)
    mlflow.log_text(report, 'classification_report_3.txt')

    # Generate and log confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    mlflow.log_text(str(conf_matrix), 'confusion_matrix_3.txt')

    # Print the classification report and confusion matrix
    print(report)
    print(conf_matrix)