import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report, confusion_matrix from imblearn.over_sampling import SMOTE from imblearn.pipeline import Pipeline as imbpipeline import mlflow import dagshub # Initialize DagsHub and MLflow for tracking dagshub.init(repo_owner='alaa.hussein401', repo_name='Winemakers-Dilemma', mlflow=True) mlflow.set_tracking_uri('https://dagshub.com/alaa.hussein401/Winemakers-Dilemma.mlflow') # Start an MLflow run with mlflow.start_run(): # Load the dataset df = pd.read_csv('vineyard_weather_1948-2017.csv') # Preprocess and feature engineering df['DATE'] = pd.to_datetime(df['DATE'], format='%Y-%m-%d') df['year'] = df['DATE'].dt.year df['week_of_year'] = df['DATE'].dt.isocalendar().week # Filter data for weeks 35 to 40 df_filtered = df[(df['week_of_year'] >= 35) & (df['week_of_year'] <= 40)] # Aggregate data by year and week number weekly_data = df_filtered.groupby(['year', 'week_of_year']).agg({ 'PRCP': 'sum', 'TMAX': 'mean', 'TMIN': 'mean', }).reset_index() # Feature engineering weekly_data['PRCP_lag1'] = weekly_data['PRCP'].shift(1) weekly_data['PRCP_lag2'] = weekly_data['PRCP'].shift(2) weekly_data['PRCP_MA3'] = weekly_data['PRCP'].rolling(window=3).mean() # Drop rows with NaN values that were created by shifts and rolling means weekly_data.dropna(inplace=True) # Select features for training features = ['TMAX', 'TMIN', 'PRCP_lag1', 'PRCP_lag2', 'PRCP_MA3'] X = weekly_data[features] y = (weekly_data['PRCP'] >= 0.35).astype(int) # Split into train and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Create a pipeline that first standardizes the data, then applies SMOTE, and finally fits the model pipeline = imbpipeline([ ('scaler', StandardScaler()), ('smote', SMOTE(random_state=42)), ('classifier', RandomForestClassifier(n_estimators=200, random_state=42)) ]) # Fit the pipeline pipeline.fit(X_train, y_train) # Make predictions y_pred = pipeline.predict(X_test) # Log parameters, metrics, and artifacts mlflow.log_params(pipeline.named_steps['classifier'].get_params()) mlflow.log_metric('accuracy', pipeline.score(X_test, y_test)) # Generate and log classification report report = classification_report(y_test, y_pred) mlflow.log_text(report, 'classification_report_3.txt') # Generate and log confusion matrix conf_matrix = confusion_matrix(y_test, y_pred) mlflow.log_text(str(conf_matrix), 'confusion_matrix_3.txt') # Print the classification report and confusion matrix print(report) print(conf_matrix)