import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
import mlflow
import dagshub
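
# Dependencies assumed installed:
#   pip install pandas scikit-learn imbalanced-learn mlflow dagshub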

# Initialize DagsHub and MLflow for tracking
dagshub.init(repo_owner='alaa.hussein401', repo_name='Winemakers-Dilemma', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/alaa.hussein401/Winemakers-Dilemma.mlflow')

# Start an MLflow run
with mlflow.start_run():
    # Load the dataset
    df = pd.read_csv('vineyard_weather_1948-2017.csv')
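    # Assumes NOAA-style daily observations with columns DATE,
    # PRCP (precipitation), TMAX and TMIN (daily max/min temperature).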

    # Preprocess and feature engineering
    df['DATE'] = pd.to_datetime(df['DATE'], format='%Y-%m-%d')
    df['year'] = df['DATE'].dt.year
    df['week_of_year'] = df['DATE'].dt.isocalendar().week
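    # isocalendar().week yields the ISO week number (1-53) for each date.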

    # Filter data for weeks 35 to 40
    df_filtered = df[(df['week_of_year'] >= 35) & (df['week_of_year'] <= 40)]

    # Aggregate data by year and week number
    weekly_data = df_filtered.groupby(['year', 'week_of_year']).agg({
        'PRCP': 'sum',
        'TMAX': 'mean',
        'TMIN': 'mean',
    }).reset_index()

    # Feature engineering: lagged precipitation and a 3-week moving average,
    # computed within each year so features never span the gap between one
    # harvest season (week 40) and the next (week 35 of the following year)
    weekly_data['PRCP_lag1'] = weekly_data.groupby('year')['PRCP'].shift(1)
    weekly_data['PRCP_lag2'] = weekly_data.groupby('year')['PRCP'].shift(2)
    weekly_data['PRCP_MA3'] = (
        weekly_data.groupby('year')['PRCP']
                   .transform(lambda s: s.rolling(window=3).mean())
    )

    # Drop rows with NaN values that were created by shifts and rolling means
    weekly_data.dropna(inplace=True)

    # Select features for training
    features = ['TMAX', 'TMIN', 'PRCP_lag1', 'PRCP_lag2', 'PRCP_MA3']
    X = weekly_data[features]
    y = (weekly_data['PRCP'] >= 0.35).astype(int)
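    # Binary label: 1 when weekly precipitation reaches 0.35, the cutoff
    # assumed here to define a harvest-season storm.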

    # Split into train and test sets, stratified to preserve class balance
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Create a pipeline that first standardizes the data, then applies SMOTE, and finally fits the model
    pipeline = imbpipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
    ])
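    # imblearn's Pipeline applies the SMOTE step during fit only, so the
    # oversampling sees the training data but never the test set.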

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Log parameters, metrics, and artifacts
    mlflow.log_params(pipeline.named_steps['classifier'].get_params())
    mlflow.log_metric('accuracy', pipeline.score(X_test, y_test))
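    # Pipeline.score delegates to the classifier's score method, i.e. plain
    # accuracy on the held-out test set.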
    
    # Generate and log classification report
    report = classification_report(y_test, y_pred)
    mlflow.log_text(report, 'classification_report_3.txt')

    # Generate and log confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    mlflow.log_text(str(conf_matrix), 'confusion_matrix_3.txt')

    # Print the classification report and confusion matrix
    print(report)
    print(conf_matrix)